# ==== preparing the new betas ====
library( ff )
## Loading required package: bit
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
##
## Attaching package: 'bit'
## The following object is masked from 'package:base':
##
## xor
## Attaching package ff
## - getOption("fftempdir")=="C:/Users/jro049/AppData/Local/Temp/RtmpCGlgFl"
## - getOption("ffextension")=="ff"
## - getOption("ffdrop")==TRUE
## - getOption("fffinonexit")==TRUE
## - getOption("ffpagesize")==65536
## - getOption("ffcaching")=="mmnoflush" -- consider "ffeachflush" if your system stalls on large writes
## - getOption("ffbatchbytes")==1030781665.28 -- consider a different value for tuning your system
## - getOption("ffmaxbytes")==51539083264 -- consider a different value for tuning your system
##
## Attaching package: 'ff'
## The following objects are masked from 'package:bit':
##
## clone, clone.default, clone.list
## The following objects are masked from 'package:utils':
##
## write.csv, write.csv2
## The following objects are masked from 'package:base':
##
## is.factor, is.ordered
library( tidyr )
## Warning: package 'tidyr' was built under R version 3.5.3
library( plyr )
library( dplyr )
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
my.wd <- getwd()
setwd( "../Haplin_metylering/Haplin/R/" )
all.funcs <- list.files()
invisible( sapply( all.funcs, source ) )
setwd( my.wd )
.onLoad( "Haplin", "Haplin" )
source( "common_variables.R" )
# 1. reading new betas
betas.ff <- envDataLoad( "newest_betas" )
betas.ff <- betas.ff[[1]]
dim( betas.ff )
## [1] 462248 868
# since the new betas have different indiv.ids, it's best to translate it back to the old scheme
link.old.new <- read.table( "../ORIG_DATA/idlink.csv", sep = ",", header = TRUE )
link.old.new$sampleID <- sub( "@", ".", x = link.old.new$sampleID, fixed = TRUE )
colnames( betas.ff ) <- link.old.new$sampleID[ match( colnames( betas.ff ), link.old.new$id ) ]
# 2. gathering info about the SNPs
cpg.info <- read.csv( "../ORIG_DATA/cpg_info_all.csv" )
cpg.info$TargetID <- as.character( cpg.info$TargetID )
rownames( cpg.info ) <- cpg.info$TargetID
cpg.info <- cpg.info[ !is.na(cpg.info$MAPINFO), ]
load( "reg_info_found_obj.RData" )
my.SNP.map
## Marker chr coordinate gene_name
## 1 rs742071 1 18979874 PAX7
## 2 rs560426 1 94553438 ABCA4
## 3 rs642961 1 209989270 IRF6
## 4 rs7590268 2 43540125 THADA
## 5 rs12543318 8 88868340 8q21.3
## 6 rs987525 8 129946154 8q24
## 7 rs3758249 9 100614140 FOXE1
## 8 rs7078160 10 118827560 KIAA1598
## 9 rs8001641 13 80692811 SPRY2
## 10 rs1873147 15 63312632 TPM1
## 11 rs227731 17 54773238 NOG1
## 12 rs13041247 20 39269074 MAFB
str( my.CpGs )
## List of 12
## $ PAX7 :'data.frame': 65 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 1537 18026 36799 44611 54358 58068 82213 82899 82958 85192 ...
## ..$ coord: int [1:65] 18959268 18959354 18969652 18964181 18931477 18993307 18973203 18963958 18956762 18972430 ...
## ..$ chr : num [1:65] 1 1 1 1 1 1 1 1 1 1 ...
## $ ABCA4 :'data.frame': 21 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 29892 37063 39339 39372 82354 87369 90016 92000 134710 176261 ...
## ..$ coord: int [1:21] 94579396 94534645 94511166 94560851 94571619 94559714 94585452 94586592 94587076 94514548 ...
## ..$ chr : num [1:21] 1 1 1 1 1 1 1 1 1 1 ...
## $ IRF6 :'data.frame': 60 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 1048 13557 20816 69785 75813 75839 82334 85157 87435 91289 ...
## ..$ coord: int [1:60] 209958035 209956565 209979345 209956950 209958185 210000559 210001371 210000258 209979756 210001279 ...
## ..$ chr : num [1:60] 1 1 1 1 1 1 1 1 1 1 ...
## $ THADA :'data.frame': 15 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 34043 119119 130226 200497 215438 226075 260389 275467 336938 347117 ...
## ..$ coord: int [1:15] 43521066 43567990 43573198 43515279 43584626 43523983 43560368 43540749 43495541 43555466 ...
## ..$ chr : num [1:15] 2 2 2 2 2 2 2 2 2 2 ...
## $ 8q21.3 :'data.frame': 17 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 315 32865 47177 66988 71522 83870 138435 194758 216020 240561 ...
## ..$ coord: int [1:17] 88885254 88886051 88886432 88845118 88886262 88889648 88886306 88885186 88886243 88884156 ...
## ..$ chr : num [1:17] 8 8 8 8 8 8 8 8 8 8 ...
## $ 8q24 :'data.frame': 3 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 59093 300078 398881
## ..$ coord: int [1:3] 129995502 129912602 129985596
## ..$ chr : num [1:3] 8 8 8
## $ FOXE1 :'data.frame': 23 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 5964 37840 44224 80592 82604 86904 200690 212239 247701 250419 ...
## ..$ coord: int [1:23] 100611516 100612587 100615244 100618041 100608962 100564659 100619991 100615357 100566588 100614879 ...
## ..$ chr : num [1:23] 9 9 9 9 9 9 9 9 9 9 ...
## $ KIAA1598:'data.frame': 1 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 399382
## ..$ coord: int 118855542
## ..$ chr : num 10
## $ SPRY2 :'data.frame': 6 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 35100 178737 361732 373559 447673 457287
## ..$ coord: int [1:6] 80726306 80657784 80706924 80670608 80715583 80644774
## ..$ chr : num [1:6] 13 13 13 13 13 13
## $ TPM1 :'data.frame': 49 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 11063 11534 13425 21584 64871 77283 81337 84259 108975 109045 ...
## ..$ coord: int [1:49] 63333846 63337496 63340700 63356886 63270641 63278936 63334937 63334506 63314579 63340331 ...
## ..$ chr : num [1:49] 15 15 15 15 15 15 15 15 15 15 ...
## $ NOG1 :'data.frame': 10 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 65345 84453 105461 139939 161457 394385 435162 435758 465614 483664
## ..$ coord: int [1:10] 54756137 54785335 54756052 54821256 54776791 54756378 54794017 54821128 54802123 54793185
## ..$ chr : num [1:10] 17 17 17 17 17 17 17 17 17 17
## $ MAFB :'data.frame': 23 obs. of 3 variables:
## ..$ id : Factor w/ 485577 levels "cg00000029","cg00000108",..: 12241 34306 50900 65551 110393 130009 178445 232661 238963 250747 ...
## ..$ coord: int [1:23] 39315211 39316308 39317194 39309738 39318437 39312290 39312434 39318518 39316218 39314091 ...
## ..$ chr : num [1:23] 20 20 20 20 20 20 20 20 20 20 ...
# 4. choosing a subset of betas
which.rows.betas <- sapply( as.character( my.SNP.map$gene_name ), function(x){
cur.cpgs <- my.CpGs[[ x ]]
match( as.character( cur.cpgs$id ), rownames( betas.ff ) )
} )
names( which.rows.betas ) <- as.character( my.SNP.map$gene_name )
sapply( which.rows.betas, length )
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598
## 65 21 60 15 17 3 23 1
## SPRY2 TPM1 NOG1 MAFB
## 6 49 10 23
my.CpGs.betas <- lapply( as.character( my.SNP.map$gene_name ), function(x){
cur.rows <- which.rows.betas[[ x ]]
cur.which.na <- is.na( cur.rows )
cur.rows <- cur.rows[ !cur.which.na ]
if( !is.null( cur.rows ) ){
tmp.betas <- betas.ff[ cur.rows, , drop = FALSE ]
data.frame( CpG = rownames( tmp.betas ), position = my.CpGs[[ x ]]$coord[ !cur.which.na ],
tmp.betas )
}
} )
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpCGlgFl/tmp/RtmpIuQl74/ff75e06668bfb1.ff
names( my.CpGs.betas ) <- as.character( my.SNP.map$gene_name )
sapply( my.CpGs.betas, dim )
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598 SPRY2 TPM1 NOG1 MAFB
## [1,] 64 21 57 14 17 3 22 1 6 46 10 23
## [2,] 870 870 870 870 870 870 870 870 870 870 870 870
# 5. create M values from the beta values
m <- function( beta ){
log2( beta/( 1 - beta ) )
}
betas.chosen.CpGs.list <- lapply( as.character( my.SNP.map$gene_name ), function(x){
cur.betas <- my.CpGs.betas[[ x ]]
if( !is.null( cur.betas ) ){
tmp.df <- gather( cur.betas, indiv, beta, starts_with( "BAE" ) )
tmp.df <- arrange( tmp.df, CpG, indiv )
mvals.tmp <- m( tmp.df$beta )
cbind( name = x, tmp.df, mvalue = mvals.tmp )
}
} )
names( betas.chosen.CpGs.list ) <- as.character( my.SNP.map$gene_name )
lapply( betas.chosen.CpGs.list, function(x){
summary( x$mvalue )
} )
## $PAX7
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -9.450 -6.019 -4.755 -4.106 -3.128 5.739
##
## $ABCA4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8.966 2.374 4.224 3.178 4.867 6.649
##
## $IRF6
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -9.554 -6.748 -4.827 -2.377 3.324 6.695
##
## $THADA
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.508 3.528 4.897 4.130 5.352 6.882
##
## $`8q21.3`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.052 4.547 5.193 5.033 5.713 7.097
##
## $`8q24`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.6327 2.3549 5.0024 4.1278 5.4508 6.4113
##
## $FOXE1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8.913 -6.069 -4.905 -3.812 -2.530 5.686
##
## $KIAA1598
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.150 4.927 5.151 5.123 5.325 6.085
##
## $SPRY2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.253 2.923 4.349 4.244 5.510 7.036
##
## $TPM1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -10.3246 -6.4510 0.6053 -0.8714 4.4662 6.7038
##
## $NOG1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8.2749 -4.2558 2.8391 0.4788 4.8034 6.2590
##
## $MAFB
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -9.242 -6.313 -4.988 -4.299 -3.668 4.870
betas.chosen.CpGs.df <- do.call( what = "rbind", betas.chosen.CpGs.list )
which.null <- unlist( lapply( betas.chosen.CpGs.list, is.null ) )
which.null
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## SPRY2 TPM1 NOG1 MAFB
## FALSE FALSE FALSE FALSE
gene.labels <- paste( as.character( my.SNP.map$gene_name[ !which.null ] ), "( chr",
my.SNP.map$chr[ !which.null ], ")" )
names( gene.labels ) <- as.character( my.SNP.map$gene_name[ !which.null ] )
gene.labels
## PAX7 ABCA4 IRF6
## "PAX7 ( chr 1 )" "ABCA4 ( chr 1 )" "IRF6 ( chr 1 )"
## THADA 8q21.3 8q24
## "THADA ( chr 2 )" "8q21.3 ( chr 8 )" "8q24 ( chr 8 )"
## FOXE1 KIAA1598 SPRY2
## "FOXE1 ( chr 9 )" "KIAA1598 ( chr 10 )" "SPRY2 ( chr 13 )"
## TPM1 NOG1 MAFB
## "TPM1 ( chr 15 )" "NOG1 ( chr 17 )" "MAFB ( chr 20 )"
# 6. choosing the individuals with genetic data also
all.indiv <- betas.chosen.CpGs.df$indiv
all.indiv <- t( sapply( all.indiv, function( x ){ unlist( strsplit( x, split = ".",
fixed = TRUE ) ) } ) )
rownames( all.indiv ) <- NULL
betas.chosen.CpGs.df <- data.frame( betas.chosen.CpGs.df[ ,c( "name", "CpG", "position" ) ],
indiv = all.indiv[ ,1 ], indiv.ctd = all.indiv[ ,2 ],
betas.chosen.CpGs.df[ ,c( "beta", "mvalue" ) ], stringsAsFactors = FALSE )
all.indiv <- betas.chosen.CpGs.df$indiv
# 6a. matching individuals - separately for each cleft type
betas.df.subgroup <- lapply( cleft.types, function(x){
ids.final <- read.table( file.names[ x ], header = TRUE, stringsAsFactors = FALSE )
cat( "cleft type: ", x, "\n")
print( dim( ids.final ) )
betas.df.tmp <- c()
for( p in ids.final$ID.BSI ){
betas.df.tmp <- rbind( betas.df.tmp, betas.chosen.CpGs.df[
betas.chosen.CpGs.df$indiv == p, ] )
}
return( betas.df.tmp)
} )
## cleft type: CLO
## [1] 103 5
## cleft type: CL/P
## [1] 269 5
## cleft type: CPO
## [1] 140 5
## cleft type: CLP
## [1] 166 5
## cleft type: cntrl
## [1] 456 5
names( betas.df.subgroup ) <- cleft.types
sapply( betas.df.subgroup, dim )
## CLO CL/P CPO CLP cntrl
## [1,] 29252 76396 39760 47144 129504
## [2,] 7 7 7 7 7
# checking whether I took correct data
sapply( betas.df.subgroup, function(x){
length( unique( x$indiv ) )
} )
## CLO CL/P CPO CLP cntrl
## 103 269 140 166 456
# 7. saving the data
save( betas.df.subgroup, my.SNP.map, gene.labels, file = "newest_betas_prepared.RData" )