## preparing for the POOxM
## categorizing based on ensembl database
# (taken from 'FOXE1_and_methylation_report_CL_CLP_PC.Rmd')
library( dplyr )
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load( "newest_betas_prepared.RData" )
load( "reg_info_found_obj.RData" )
source( "common_variables.R" )
# 1. taking the unique CpGs
# the CpGs per gene are the same in each cleft type group:
betas.df.subgroup1 <- betas.df.subgroup[[ 1 ]]
gene.names <- unique( as.character( betas.df.subgroup1$name ) )
unique.CpGs.per.gene <- lapply( gene.names, function(x){
cur.df <- betas.df.subgroup1[ as.character( betas.df.subgroup1$name ) == x, ]
cur.df$CpG <- as.character( cur.df$CpG )
cur.unique.cpgs <- unique( cur.df$CpG )
cur.chr <- my.SNP.map$chr[ my.SNP.map$gene_name == x ]
cur.pos <- cur.df$position[ match( cur.unique.cpgs, cur.df$CpG ) ]
data.frame( CpG = cur.unique.cpgs, position = cur.pos, chr = cur.chr )
} )
names( unique.CpGs.per.gene ) <- gene.names
sapply( unique.CpGs.per.gene, dim )
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598 SPRY2 TPM1 NOG1 MAFB
## [1,] 64 21 57 14 17 3 22 1 6 46 10 23
## [2,] 3 3 3 3 3 3 3 3 3 3 3 3
# 2. adding the information to the betas.df
betas.df.subgroup <- lapply( cleft.types, function( clt ){
betas.df.tmp <- betas.df.subgroup[[ clt ]]
betas.df.tmp$ensembl_reg_feat <- ""
betas.df.tmp$gene_name <- ""
unique.cpg.names <- as.character( do.call( rbind, unique.CpGs.per.gene )$CpG )
for( cpg in unique.cpg.names ){
cur.idx <- as.character( betas.df.tmp$CpG ) == cpg
cur.annot <- reg.info.found.df$so_name[ as.character( reg.info.found.df$CpG ) == cpg ]
if( length( cur.annot ) == 0 ){
next
}
if( length( cur.annot ) > 1 ){
cur.annot <- paste( cur.annot, collapse = ", " )
}
betas.df.tmp$ensembl_reg_feat[ cur.idx ] <- cur.annot
}
return( betas.df.tmp )
} )
names( betas.df.subgroup ) <- cleft.types
sapply( betas.df.subgroup, function( x ){
table( x$ensembl_reg_feat )
} )
## CLO CL/P CPO CLP cntrl
## 18128 47344 24640 29216 80256
## enhancer 1133 2959 1540 1826 5016
## promoter 9270 24210 12600 14940 41040
## promoter_flanking_region 721 1883 980 1162 3192
# 3. creating the matrices for "promoter" regions
chosen.reg.feat <- "promoter"
betas.df.subgroup.promoter.list <- list()
betas.matrix.promoter.list <- lapply( cleft.types, function( clt ){
cur.betas.df <- betas.df.subgroup[[ clt ]]
which.betas.promoter <- grepl( x = cur.betas.df$ensembl_reg_feat,
pattern = chosen.reg.feat, fixed = TRUE )
betas.promoter.tmp <- cur.betas.df[ which.betas.promoter, ]
# print( table( betas.promoter.tmp$ensembl_reg_feat ) )
betas.promoter.tmp$name <- as.character( betas.promoter.tmp$name )
betas.promoter.tmp$indiv <- as.character( betas.promoter.tmp$indiv )
betas.df.subgroup.promoter.list[[ clt ]] <<- betas.promoter.tmp
betas.matrix.4PC.ensembl.promoter.list <- list()
for( gene in unique( betas.promoter.tmp$name ) ){
betas.df.tmp <- filter( betas.promoter.tmp, name == gene )
cpgs.unique.tmp <- unique( betas.df.tmp$CpG )
betas.matrix.tmp <- lapply( unique( betas.df.tmp$indiv ), function( x ){
indiv.rows <- which( betas.df.tmp$indiv == x )
tmp <- betas.df.tmp[ indiv.rows, c( "CpG", "beta" ) ]
out <- matrix( as.numeric( tmp$beta ), nrow = 1 )
colnames( out ) <- tmp$CpG
out
} )
common.cpgs <- unique( unlist( lapply( betas.matrix.tmp, colnames ) ) )
betas.matrix.tmp.out <- matrix( NA, nrow = length( betas.matrix.tmp ), ncol =
length( common.cpgs ), dimnames = list( NULL, common.cpgs ) )
for( i in 1:length( betas.matrix.tmp ) ){
cur.matrix.tmp <- betas.matrix.tmp[[ i ]]
which.cols <- match( colnames( cur.matrix.tmp ), common.cpgs )
colnames( cur.matrix.tmp ) <- NULL
betas.matrix.tmp.out[ i,which.cols ] <- cur.matrix.tmp
}
which.cols.nas <- apply( betas.matrix.tmp.out, 2, function(x){ any( is.na( x ) ) } )
betas.matrix.tmp.out <- betas.matrix.tmp.out[ ,!which.cols.nas, drop = FALSE ]
rownames( betas.matrix.tmp.out ) <- unique( betas.df.tmp$indiv )
betas.matrix.4PC.ensembl.promoter.list <- c( betas.matrix.4PC.ensembl.promoter.list,
list( betas.matrix.tmp.out ) )
}
names( betas.matrix.4PC.ensembl.promoter.list ) <- unique( betas.promoter.tmp$name )
# sapply( betas.matrix.4PC.ensembl.promoter.list, dim )
return( betas.matrix.4PC.ensembl.promoter.list )
} )
names( betas.matrix.promoter.list ) <- cleft.types
lapply( betas.matrix.promoter.list, function( x ){
sapply( x, dim )
} )
## $CLO
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,] 103 103 103 103 103 103 103 103 103 103 103
## [2,] 19 1 14 1 1 1 6 1 30 2 21
##
## $`CL/P`
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,] 269 269 269 269 269 269 269 269 269 269 269
## [2,] 19 1 14 1 1 1 6 1 30 2 21
##
## $CPO
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,] 140 140 140 140 140 140 140 140 140 140 140
## [2,] 19 1 14 1 1 1 6 1 30 2 21
##
## $CLP
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,] 166 166 166 166 166 166 166 166 166 166 166
## [2,] 19 1 14 1 1 1 6 1 30 2 21
##
## $cntrl
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 SPRY2 TPM1 NOG1 MAFB
## [1,] 456 456 456 456 456 456 456 456 456 456 456
## [2,] 19 1 14 1 1 1 6 1 30 2 21
sapply( betas.df.subgroup.promoter.list, function( x ){
table( x$ensembl_reg_feat )
} )
## CLO CL/P CPO CLP cntrl
## promoter 9270 24210 12600 14940 41040
## promoter_flanking_region 721 1883 980 1162 3192
# 4. creating the matrices for "enhancer"
chosen.reg.feat2 <- c( "enhancer" )
betas.df.subgroup.enhancer.list <- list()
betas.matrix.enhancer.list <- lapply( cleft.types, function( clt ){
cur.betas.df <- betas.df.subgroup[[ clt ]]
which.betas.enhancer <- grepl( x = cur.betas.df$ensembl_reg_feat,
pattern = chosen.reg.feat2, fixed = TRUE )
betas.enhancer.tmp <- cur.betas.df[ which.betas.enhancer, ]
dim( betas.enhancer.tmp )
head( betas.enhancer.tmp )
table( betas.enhancer.tmp$ensembl_reg_feat )
betas.enhancer.tmp$name <- as.character( betas.enhancer.tmp$name )
betas.enhancer.tmp$indiv <- as.character( betas.enhancer.tmp$indiv )
betas.df.subgroup.enhancer.list[[ clt ]] <<- betas.enhancer.tmp
betas.matrix.4PC.enhancer.list <- list()
for( gene in unique( betas.enhancer.tmp$name ) ){
betas.df.tmp <- betas.enhancer.tmp[ betas.enhancer.tmp$name == gene, ]
cpgs.unique.tmp <- unique( betas.df.tmp$CpG )
betas.matrix.tmp <- lapply( unique( betas.df.tmp$indiv ), function( x ){
indiv.rows <- which( betas.df.tmp$indiv == x )
tmp <- betas.df.tmp[ indiv.rows, c( "CpG", "beta" ) ]
out <- matrix( as.numeric( tmp$beta ), nrow = 1 )
colnames( out ) <- tmp$CpG
out
} )
common.cpgs <- unique( unlist( lapply( betas.matrix.tmp, colnames ) ) )
betas.matrix.tmp.out <- matrix( NA, nrow = length( betas.matrix.tmp ), ncol =
length( common.cpgs ), dimnames = list( NULL, common.cpgs ) )
for( i in 1:length( betas.matrix.tmp ) ){
cur.matrix.tmp <- betas.matrix.tmp[[ i ]]
which.cols <- match( colnames( cur.matrix.tmp ), common.cpgs )
colnames( cur.matrix.tmp ) <- NULL
betas.matrix.tmp.out[ i,which.cols ] <- cur.matrix.tmp
}
which.cols.nas <- apply( betas.matrix.tmp.out, 2, function(x){ any( is.na( x ) ) } )
betas.matrix.tmp.out <- betas.matrix.tmp.out[ ,!which.cols.nas, drop = FALSE ]
rownames( betas.matrix.tmp.out ) <- unique( betas.df.tmp$indiv )
betas.matrix.4PC.enhancer.list <- c( betas.matrix.4PC.enhancer.list,
list( betas.matrix.tmp.out ) )
}
names( betas.matrix.4PC.enhancer.list ) <- unique( betas.enhancer.tmp$name )
return( betas.matrix.4PC.enhancer.list )
} )
names( betas.matrix.enhancer.list ) <- cleft.types
lapply( betas.matrix.enhancer.list, function( x ){
sapply( x, dim )
} )
## $CLO
## PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,] 103 103 103 103 103 103
## [2,] 1 6 1 1 1 1
##
## $`CL/P`
## PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,] 269 269 269 269 269 269
## [2,] 1 6 1 1 1 1
##
## $CPO
## PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,] 140 140 140 140 140 140
## [2,] 1 6 1 1 1 1
##
## $CLP
## PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,] 166 166 166 166 166 166
## [2,] 1 6 1 1 1 1
##
## $cntrl
## PAX7 ABCA4 THADA 8q24 KIAA1598 SPRY2
## [1,] 456 456 456 456 456 456
## [2,] 1 6 1 1 1 1
sapply( betas.df.subgroup.enhancer.list, function( x ){
table( x$ensembl_reg_feat )
} )
## CLO.enhancer CL/P.enhancer CPO.enhancer CLP.enhancer cntrl.enhancer
## 1133 2959 1540 1826 5016
# 5. saving the data
save( betas.matrix.promoter.list, betas.matrix.enhancer.list,
betas.df.subgroup.promoter.list, betas.df.subgroup.enhancer.list,
file = "new_betas_matrices_ensembl_promoter_enhancer.RData" )
# 6. now, finding the gene regions, and creating the matrix
load( "gene_info_found_obj.RData" )
head( gene.info.found.df )
## orig_gene_name CpG ensembl_gene_id start_position end_position
## 1 PAX7 cg00065215 ENSG00000009709 18957500 19075360
## 2 PAX7 cg00855990 ENSG00000009709 18957500 19075360
## 3 PAX7 cg01797257 ENSG00000009709 18957500 19075360
## 4 PAX7 cg02178648 ENSG00000009709 18957500 19075360
## 5 PAX7 cg02856716 ENSG00000009709 18957500 19075360
## 6 PAX7 cg04094653 ENSG00000009709 18957500 19075360
## external_gene_name
## 1 PAX7
## 2 PAX7
## 3 PAX7
## 4 PAX7
## 5 PAX7
## 6 PAX7
betas.df.subgroup.gene.list <- list()
betas.matrix.gene.list <- lapply( cleft.types, function( clt ){
betas.df.tmp <- betas.df.subgroup[[ clt ]]
for( cpg in unique( gene.info.found.df$CpG ) ){
cur.cpg.idx <- which( betas.df.tmp$CpG == cpg )
cur.gene.name <- as.character( dplyr::filter( gene.info.found.df, CpG == cpg )$external_gene_name )
if( length( cur.gene.name ) > 1 ){
cur.gene.name <- paste( cur.gene.name, collapse = "," )
}
betas.df.tmp$gene_name[ cur.cpg.idx ] <- cur.gene.name
}
# table( betas.df.tmp$gene_name)
betas.df.subgroup.gene <- betas.df.tmp[ betas.df.tmp$gene_name != "", ]
betas.df.subgroup.gene$name <- as.character( betas.df.subgroup.gene$name )
betas.df.subgroup.gene$indiv <- as.character( betas.df.subgroup.gene$indiv )
betas.df.subgroup.gene.list[[ clt ]] <<- betas.df.subgroup.gene
betas.matrix.4PC.gene.list.all <- list()
for( gene in unique( betas.df.subgroup.gene$name ) ){
betas.df.tmp <- betas.df.subgroup.gene[ betas.df.subgroup.gene$name == gene, ]
cpgs.unique.tmp <- unique( betas.df.tmp$CpG )
betas.matrix.tmp <- lapply( unique( betas.df.tmp$indiv ), function( x ){
indiv.rows <- which( betas.df.tmp$indiv == x )
tmp <- betas.df.tmp[ indiv.rows, c( "CpG", "beta" ) ]
out <- matrix( as.numeric( tmp$beta ), nrow = 1 )
colnames( out ) <- tmp$CpG
out
} )
common.cpgs <- unique( unlist( lapply( betas.matrix.tmp, colnames ) ) )
betas.matrix.tmp.out <- matrix( NA, nrow = length( betas.matrix.tmp ), ncol =
length( common.cpgs ), dimnames = list( NULL, common.cpgs ) )
for( i in 1:length( betas.matrix.tmp ) ){
cur.matrix.tmp <- betas.matrix.tmp[[ i ]]
which.cols <- match( colnames( cur.matrix.tmp ), common.cpgs )
colnames( cur.matrix.tmp ) <- NULL
betas.matrix.tmp.out[ i,which.cols ] <- cur.matrix.tmp
}
which.cols.nas <- apply( betas.matrix.tmp.out, 2, function(x){ any( is.na( x ) ) } )
betas.matrix.tmp.out <- betas.matrix.tmp.out[ ,!which.cols.nas, drop = FALSE ]
rownames( betas.matrix.tmp.out ) <- unique( betas.df.tmp$indiv )
betas.matrix.4PC.gene.list.all <- c( betas.matrix.4PC.gene.list.all,
list( betas.matrix.tmp.out ) )
}
names( betas.matrix.4PC.gene.list.all ) <- unique( betas.df.subgroup.gene$name )
return( betas.matrix.4PC.gene.list.all )
} )
names( betas.matrix.gene.list ) <- cleft.types
lapply( betas.matrix.gene.list, function( x ){
sapply( x, dim )
} )
## $CLO
## PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,] 103 103 103 103 103 103 103 103
## [2,] 54 16 27 14 7 1 29 9
##
## $`CL/P`
## PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,] 269 269 269 269 269 269 269 269
## [2,] 54 16 27 14 7 1 29 9
##
## $CPO
## PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,] 140 140 140 140 140 140 140 140
## [2,] 54 16 27 14 7 1 29 9
##
## $CLP
## PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,] 166 166 166 166 166 166 166 166
## [2,] 54 16 27 14 7 1 29 9
##
## $cntrl
## PAX7 ABCA4 IRF6 THADA FOXE1 KIAA1598 TPM1 MAFB
## [1,] 456 456 456 456 456 456 456 456
## [2,] 54 16 27 14 7 1 29 9
save( betas.matrix.gene.list, betas.df.subgroup.gene.list,
file = "new_betas_df_subgroup_gene.RData" )