## preparing for the POOxM
## categorizing based on ensembl database
# (taken from 'FOXE1_and_methylation_report_CL_CLP_PC.Rmd')

library( dplyr )
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load( "newest_betas_prepared.RData" )
load( "reg_info_found_obj.RData" )
source( "common_variables.R" )
# 1. taking the unique CpGs

#   the CpGs per gene are the same in each cleft type group:
betas.df.subgroup1 <- betas.df.subgroup[[ 1 ]]
gene.names <- unique( as.character( betas.df.subgroup1$name ) )
unique.CpGs.per.gene <- lapply( gene.names, function(x){
    cur.df <- betas.df.subgroup1[ as.character( betas.df.subgroup1$name ) == x, ]
    cur.df$CpG <- as.character( cur.df$CpG )
    cur.unique.cpgs <- unique( cur.df$CpG )
    cur.chr <- my.SNP.map$chr[ my.SNP.map$gene_name == x ]
    cur.pos <- cur.df$position[ match( cur.unique.cpgs, cur.df$CpG ) ]
    data.frame( CpG = cur.unique.cpgs, position = cur.pos, chr = cur.chr )
} )
names( unique.CpGs.per.gene ) <- gene.names
sapply( unique.CpGs.per.gene, dim )
##      PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598 SPRY2 TPM1 NOG1 MAFB
## [1,]   64    21   57    14     17    3    22        1     6   46   10   23
## [2,]    3     3    3     3      3    3     3        3     3    3    3    3
# 2. adding the information to the betas.df
betas.df.subgroup <- lapply( cleft.types, function( clt ){
  betas.df.tmp <- betas.df.subgroup[[ clt ]]
  betas.df.tmp$ensembl_reg_feat <- ""
  betas.df.tmp$gene_name <- ""
  unique.cpg.names <- as.character( do.call( rbind, unique.CpGs.per.gene )$CpG )
  for( cpg in unique.cpg.names ){
    cur.idx <- as.character( betas.df.tmp$CpG ) == cpg
    cur.annot <- reg.info.found.df$so_name[ as.character( reg.info.found.df$CpG ) == cpg ]
    if( length( cur.annot ) == 0 ){
        next
    }
    if( length( cur.annot ) > 1 ){
        cur.annot <- paste( cur.annot, collapse = ", " )
    }
    betas.df.tmp$ensembl_reg_feat[ cur.idx ] <- cur.annot
  }
  return( betas.df.tmp )
} )
names( betas.df.subgroup ) <- cleft.types
sapply( betas.df.subgroup, function( x ){
  table( x$ensembl_reg_feat )
} )
##                            CLO  CL/P   CPO   CLP cntrl
##                          18128 47344 24640 29216 80256
## enhancer                  1133  2959  1540  1826  5016
## promoter                  9270 24210 12600 14940 41040
## promoter_flanking_region   721  1883   980  1162  3192
# 3. creating the matrices for "promoter" regions
chosen.reg.feat <- "promoter"

betas.df.subgroup.promoter.list <- list()
betas.matrix.promoter.list <- lapply( cleft.types, function( clt ){
  cur.betas.df <- betas.df.subgroup[[ clt ]]
  which.betas.promoter <- grepl( x = cur.betas.df$ensembl_reg_feat,
    pattern = chosen.reg.feat, fixed = TRUE )
  betas.promoter.tmp <- cur.betas.df[ which.betas.promoter, ]
  # print( table( betas.promoter.tmp$ensembl_reg_feat ) )
  
  betas.promoter.tmp$name <- as.character( betas.promoter.tmp$name )
  betas.promoter.tmp$indiv <- as.character( betas.promoter.tmp$indiv )
  betas.df.subgroup.promoter.list[[ clt ]] <<- betas.promoter.tmp
  
  betas.matrix.4PC.ensembl.promoter.list <- list()
  for( gene in unique( betas.promoter.tmp$name ) ){
    betas.df.tmp <- filter( betas.promoter.tmp, name == gene )
    cpgs.unique.tmp <- unique( betas.df.tmp$CpG )
    
    betas.matrix.tmp <- lapply( unique( betas.df.tmp$indiv ), function( x ){
        indiv.rows <- which( betas.df.tmp$indiv == x )
        tmp <- betas.df.tmp[ indiv.rows, c( "CpG", "beta" ) ]
        out <- matrix( as.numeric( tmp$beta ), nrow = 1 )
        colnames( out ) <- tmp$CpG
        out
    } )
    common.cpgs <- unique( unlist( lapply( betas.matrix.tmp, colnames ) ) )
    betas.matrix.tmp.out <- matrix( NA, nrow = length( betas.matrix.tmp ), ncol =
            length( common.cpgs ), dimnames = list( NULL, common.cpgs ) )
    for( i in 1:length( betas.matrix.tmp ) ){
        cur.matrix.tmp <- betas.matrix.tmp[[ i ]]
        which.cols <- match( colnames( cur.matrix.tmp ), common.cpgs )
        colnames( cur.matrix.tmp ) <- NULL
        betas.matrix.tmp.out[ i,which.cols ] <- cur.matrix.tmp
    }
    which.cols.nas <- apply( betas.matrix.tmp.out, 2, function(x){ any( is.na( x ) ) } )
    betas.matrix.tmp.out <- betas.matrix.tmp.out[ ,!which.cols.nas, drop = FALSE ]
    
    rownames( betas.matrix.tmp.out ) <- unique( betas.df.tmp$indiv )
  
    betas.matrix.4PC.ensembl.promoter.list <- c( betas.matrix.4PC.ensembl.promoter.list,
        list( betas.matrix.tmp.out ) )
  }
  names( betas.matrix.4PC.ensembl.promoter.list ) <- unique( betas.promoter.tmp$name )
  # sapply( betas.matrix.4PC.ensembl.promoter.list, dim )
  return( betas.matrix.4PC.ensembl.promoter.list )
} )
names( betas.matrix.promoter.list ) <- cleft.types
lapply( betas.matrix.promoter.list, function( x ){
  sapply( x, dim )
} )
## $CLO
##      PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,]  103   103  103   103    103  103   103   103  103  103  103
## [2,]   19     1   14     1      1    1     6     1   30    2   21
## 
## $`CL/P`
##      PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,]  269   269  269   269    269  269   269   269  269  269  269
## [2,]   19     1   14     1      1    1     6     1   30    2   21
## 
## $CPO
##      PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,]  140   140  140   140    140  140   140   140  140  140  140
## [2,]   19     1   14     1      1    1     6     1   30    2   21
## 
## $CLP
##      PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,]  166   166  166   166    166  166   166   166  166  166  166
## [2,]   19     1   14     1      1    1     6     1   30    2   21
## 
## $cntrl
##      PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,]  456   456  456   456    456  456   456   456  456  456  456
## [2,]   19     1   14     1      1    1     6     1   30    2   21
sapply( betas.df.subgroup.promoter.list, function( x ){
  table( x$ensembl_reg_feat )
} )
##                           CLO  CL/P   CPO   CLP cntrl
## promoter                 9270 24210 12600 14940 41040
## promoter_flanking_region  721  1883   980  1162  3192
# 4. creating the matrices for "enhancer"
chosen.reg.feat2 <- c( "enhancer" )

betas.df.subgroup.enhancer.list <- list()
betas.matrix.enhancer.list <- lapply( cleft.types, function( clt ){
  cur.betas.df <- betas.df.subgroup[[ clt ]]
  which.betas.enhancer <- grepl( x = cur.betas.df$ensembl_reg_feat,
    pattern = chosen.reg.feat2, fixed = TRUE )
  betas.enhancer.tmp <- cur.betas.df[ which.betas.enhancer, ]
  dim( betas.enhancer.tmp )
  head( betas.enhancer.tmp )
  table( betas.enhancer.tmp$ensembl_reg_feat )
  
  betas.enhancer.tmp$name <- as.character( betas.enhancer.tmp$name )
  betas.enhancer.tmp$indiv <- as.character( betas.enhancer.tmp$indiv )
  betas.df.subgroup.enhancer.list[[ clt ]] <<- betas.enhancer.tmp

  betas.matrix.4PC.enhancer.list <- list()
  for( gene in unique( betas.enhancer.tmp$name ) ){
    betas.df.tmp <- betas.enhancer.tmp[ betas.enhancer.tmp$name == gene, ]
    cpgs.unique.tmp <- unique( betas.df.tmp$CpG )
    
    betas.matrix.tmp <- lapply( unique( betas.df.tmp$indiv ), function( x ){
        indiv.rows <- which( betas.df.tmp$indiv == x )
        tmp <- betas.df.tmp[ indiv.rows, c( "CpG", "beta" ) ]
        out <- matrix( as.numeric( tmp$beta ), nrow = 1 )
        colnames( out ) <- tmp$CpG
        out
    } )
    common.cpgs <- unique( unlist( lapply( betas.matrix.tmp, colnames ) ) )
    betas.matrix.tmp.out <- matrix( NA, nrow = length( betas.matrix.tmp ), ncol =
            length( common.cpgs ), dimnames = list( NULL, common.cpgs ) )
    for( i in 1:length( betas.matrix.tmp ) ){
        cur.matrix.tmp <- betas.matrix.tmp[[ i ]]
        which.cols <- match( colnames( cur.matrix.tmp ), common.cpgs )
        colnames( cur.matrix.tmp ) <- NULL
        betas.matrix.tmp.out[ i,which.cols ] <- cur.matrix.tmp
    }
    which.cols.nas <- apply( betas.matrix.tmp.out, 2, function(x){ any( is.na( x ) ) } )
    betas.matrix.tmp.out <- betas.matrix.tmp.out[ ,!which.cols.nas, drop = FALSE ]
    
    rownames( betas.matrix.tmp.out ) <- unique( betas.df.tmp$indiv )
  
    betas.matrix.4PC.enhancer.list <- c( betas.matrix.4PC.enhancer.list,
        list( betas.matrix.tmp.out ) )
  }
  names( betas.matrix.4PC.enhancer.list ) <- unique( betas.enhancer.tmp$name )
  return( betas.matrix.4PC.enhancer.list )
} )
names( betas.matrix.enhancer.list ) <- cleft.types
lapply( betas.matrix.enhancer.list, function( x ){
  sapply( x, dim )
} )
## $CLO
##      PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,]  103   103   103  103      103   103
## [2,]    1     6     1    1        1     1
## 
## $`CL/P`
##      PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,]  269   269   269  269      269   269
## [2,]    1     6     1    1        1     1
## 
## $CPO
##      PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,]  140   140   140  140      140   140
## [2,]    1     6     1    1        1     1
## 
## $CLP
##      PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,]  166   166   166  166      166   166
## [2,]    1     6     1    1        1     1
## 
## $cntrl
##      PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,]  456   456   456  456      456   456
## [2,]    1     6     1    1        1     1
sapply( betas.df.subgroup.enhancer.list, function( x ){
  table( x$ensembl_reg_feat )
} )
##   CLO.enhancer  CL/P.enhancer   CPO.enhancer   CLP.enhancer cntrl.enhancer 
##           1133           2959           1540           1826           5016
# 5. saving the data
save( betas.matrix.promoter.list, betas.matrix.enhancer.list,
    betas.df.subgroup.promoter.list, betas.df.subgroup.enhancer.list,
    file = "new_betas_matrices_ensembl_promoter_enhancer.RData" )
# 6. now, finding the gene regions, and creating the matrix
load( "gene_info_found_obj.RData" )

head( gene.info.found.df )
##   orig_gene_name        CpG ensembl_gene_id start_position end_position
## 1           PAX7 cg00065215 ENSG00000009709       18957500     19075360
## 2           PAX7 cg00855990 ENSG00000009709       18957500     19075360
## 3           PAX7 cg01797257 ENSG00000009709       18957500     19075360
## 4           PAX7 cg02178648 ENSG00000009709       18957500     19075360
## 5           PAX7 cg02856716 ENSG00000009709       18957500     19075360
## 6           PAX7 cg04094653 ENSG00000009709       18957500     19075360
##   external_gene_name
## 1               PAX7
## 2               PAX7
## 3               PAX7
## 4               PAX7
## 5               PAX7
## 6               PAX7
betas.df.subgroup.gene.list <- list()
betas.matrix.gene.list <- lapply( cleft.types, function( clt ){
  betas.df.tmp <- betas.df.subgroup[[ clt ]]
  for( cpg in unique( gene.info.found.df$CpG ) ){
    cur.cpg.idx <- which( betas.df.tmp$CpG == cpg )
    cur.gene.name <- as.character( dplyr::filter( gene.info.found.df, CpG == cpg )$external_gene_name )
    if( length( cur.gene.name ) > 1 ){
      cur.gene.name <- paste( cur.gene.name, collapse = "," )
    }
    betas.df.tmp$gene_name[ cur.cpg.idx ] <- cur.gene.name
  }
  # table( betas.df.tmp$gene_name)
  betas.df.subgroup.gene <- betas.df.tmp[ betas.df.tmp$gene_name != "", ]
  
  betas.df.subgroup.gene$name <- as.character( betas.df.subgroup.gene$name )
  betas.df.subgroup.gene$indiv <- as.character( betas.df.subgroup.gene$indiv )
  betas.df.subgroup.gene.list[[ clt ]] <<- betas.df.subgroup.gene
  
  betas.matrix.4PC.gene.list.all <- list()
  for( gene in unique( betas.df.subgroup.gene$name ) ){
    betas.df.tmp <- betas.df.subgroup.gene[ betas.df.subgroup.gene$name == gene, ]
    cpgs.unique.tmp <- unique( betas.df.tmp$CpG )
    
    betas.matrix.tmp <- lapply( unique( betas.df.tmp$indiv ), function( x ){
        indiv.rows <- which( betas.df.tmp$indiv == x )
        tmp <- betas.df.tmp[ indiv.rows, c( "CpG", "beta" ) ]
        out <- matrix( as.numeric( tmp$beta ), nrow = 1 )
        colnames( out ) <- tmp$CpG
        out
    } )
    common.cpgs <- unique( unlist( lapply( betas.matrix.tmp, colnames ) ) )
    betas.matrix.tmp.out <- matrix( NA, nrow = length( betas.matrix.tmp ), ncol =
            length( common.cpgs ), dimnames = list( NULL, common.cpgs ) )
    for( i in 1:length( betas.matrix.tmp ) ){
        cur.matrix.tmp <- betas.matrix.tmp[[ i ]]
        which.cols <- match( colnames( cur.matrix.tmp ), common.cpgs )
        colnames( cur.matrix.tmp ) <- NULL
        betas.matrix.tmp.out[ i,which.cols ] <- cur.matrix.tmp
    }
    which.cols.nas <- apply( betas.matrix.tmp.out, 2, function(x){ any( is.na( x ) ) } )
    betas.matrix.tmp.out <- betas.matrix.tmp.out[ ,!which.cols.nas, drop = FALSE ]
    
    rownames( betas.matrix.tmp.out ) <- unique( betas.df.tmp$indiv )
  
    betas.matrix.4PC.gene.list.all <- c( betas.matrix.4PC.gene.list.all,
        list( betas.matrix.tmp.out ) )
  }
  names( betas.matrix.4PC.gene.list.all ) <- unique( betas.df.subgroup.gene$name )
  return( betas.matrix.4PC.gene.list.all )
} )
names( betas.matrix.gene.list ) <- cleft.types
lapply( betas.matrix.gene.list, function( x ){
  sapply( x, dim )
} )
## $CLO
##      PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,]  103   103  103   103   103      103  103  103
## [2,]   54    16   27    14     7        1   29    9
## 
## $`CL/P`
##      PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,]  269   269  269   269   269      269  269  269
## [2,]   54    16   27    14     7        1   29    9
## 
## $CPO
##      PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,]  140   140  140   140   140      140  140  140
## [2,]   54    16   27    14     7        1   29    9
## 
## $CLP
##      PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,]  166   166  166   166   166      166  166  166
## [2,]   54    16   27    14     7        1   29    9
## 
## $cntrl
##      PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,]  456   456  456   456   456      456  456  456
## [2,]   54    16   27    14     7        1   29    9
save( betas.matrix.gene.list, betas.df.subgroup.gene.list,
    file = "new_betas_df_subgroup_gene.RData" )