# ==== preparing the new betas ====

library( ff )
## Loading required package: bit
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
## 
## Attaching package: 'bit'
## The following object is masked from 'package:base':
## 
##     xor
## Attaching package ff
## - getOption("fftempdir")=="C:/Users/jro049/AppData/Local/Temp/RtmpCGlgFl"
## - getOption("ffextension")=="ff"
## - getOption("ffdrop")==TRUE
## - getOption("fffinonexit")==TRUE
## - getOption("ffpagesize")==65536
## - getOption("ffcaching")=="mmnoflush"  -- consider "ffeachflush" if your system stalls on large writes
## - getOption("ffbatchbytes")==1030781665.28 -- consider a different value for tuning your system
## - getOption("ffmaxbytes")==51539083264 -- consider a different value for tuning your system
## 
## Attaching package: 'ff'
## The following objects are masked from 'package:bit':
## 
##     clone, clone.default, clone.list
## The following objects are masked from 'package:utils':
## 
##     write.csv, write.csv2
## The following objects are masked from 'package:base':
## 
##     is.factor, is.ordered
library( tidyr )
## Warning: package 'tidyr' was built under R version 3.5.3
library( plyr )
library( dplyr )
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
my.wd <- getwd()
setwd( "../Haplin_metylering/Haplin/R/" )
all.funcs <- list.files()
invisible( sapply( all.funcs, source ) )
setwd( my.wd )

.onLoad( "Haplin", "Haplin" )

source( "common_variables.R" )
# 1. reading new betas
betas.ff <- envDataLoad( "newest_betas" )
betas.ff <- betas.ff[[1]]
dim( betas.ff )
## [1] 462248    868
# since the new betas have different indiv.ids, it's best to translate it back to the old scheme
link.old.new <- read.table( "../ORIG_DATA/idlink.csv", sep = ",", header = TRUE )
link.old.new$sampleID <- sub( "@", ".", x = link.old.new$sampleID, fixed = TRUE )
colnames( betas.ff ) <- link.old.new$sampleID[ match( colnames( betas.ff ), link.old.new$id ) ]
# 2. gathering info about the SNPs
cpg.info <- read.csv( "../ORIG_DATA/cpg_info_all.csv" )

cpg.info$TargetID <- as.character( cpg.info$TargetID )
rownames( cpg.info ) <- cpg.info$TargetID
cpg.info <- cpg.info[ !is.na(cpg.info$MAPINFO), ]

load( "reg_info_found_obj.RData" )

my.SNP.map
##        Marker chr coordinate gene_name
## 1    rs742071   1   18979874      PAX7
## 2    rs560426   1   94553438     ABCA4
## 3    rs642961   1  209989270      IRF6
## 4   rs7590268   2   43540125     THADA
## 5  rs12543318   8   88868340    8q21.3
## 6    rs987525   8  129946154      8q24
## 7   rs3758249   9  100614140     FOXE1
## 8   rs7078160  10  118827560  KIAA1598
## 9   rs8001641  13   80692811     SPRY2
## 10  rs1873147  15   63312632      TPM1
## 11   rs227731  17   54773238      NOG1
## 12 rs13041247  20   39269074      MAFB
str( my.CpGs )
## List of 12
##  $ PAX7    :'data.frame':    65 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 1537 18026 36799 44611 54358 58068 82213 82899 82958 85192 ...
##   ..$ coord: int [1:65] 18959268 18959354 18969652 18964181 18931477 18993307 18973203 18963958 18956762 18972430 ...
##   ..$ chr  : num [1:65] 1 1 1 1 1 1 1 1 1 1 ...
##  $ ABCA4   :'data.frame':    21 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 29892 37063 39339 39372 82354 87369 90016 92000 134710 176261 ...
##   ..$ coord: int [1:21] 94579396 94534645 94511166 94560851 94571619 94559714 94585452 94586592 94587076 94514548 ...
##   ..$ chr  : num [1:21] 1 1 1 1 1 1 1 1 1 1 ...
##  $ IRF6    :'data.frame':    60 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 1048 13557 20816 69785 75813 75839 82334 85157 87435 91289 ...
##   ..$ coord: int [1:60] 209958035 209956565 209979345 209956950 209958185 210000559 210001371 210000258 209979756 210001279 ...
##   ..$ chr  : num [1:60] 1 1 1 1 1 1 1 1 1 1 ...
##  $ THADA   :'data.frame':    15 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 34043 119119 130226 200497 215438 226075 260389 275467 336938 347117 ...
##   ..$ coord: int [1:15] 43521066 43567990 43573198 43515279 43584626 43523983 43560368 43540749 43495541 43555466 ...
##   ..$ chr  : num [1:15] 2 2 2 2 2 2 2 2 2 2 ...
##  $ 8q21.3  :'data.frame':    17 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 315 32865 47177 66988 71522 83870 138435 194758 216020 240561 ...
##   ..$ coord: int [1:17] 88885254 88886051 88886432 88845118 88886262 88889648 88886306 88885186 88886243 88884156 ...
##   ..$ chr  : num [1:17] 8 8 8 8 8 8 8 8 8 8 ...
##  $ 8q24    :'data.frame':    3 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 59093 300078 398881
##   ..$ coord: int [1:3] 129995502 129912602 129985596
##   ..$ chr  : num [1:3] 8 8 8
##  $ FOXE1   :'data.frame':    23 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 5964 37840 44224 80592 82604 86904 200690 212239 247701 250419 ...
##   ..$ coord: int [1:23] 100611516 100612587 100615244 100618041 100608962 100564659 100619991 100615357 100566588 100614879 ...
##   ..$ chr  : num [1:23] 9 9 9 9 9 9 9 9 9 9 ...
##  $ KIAA1598:'data.frame':    1 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 399382
##   ..$ coord: int 118855542
##   ..$ chr  : num 10
##  $ SPRY2   :'data.frame':    6 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 35100 178737 361732 373559 447673 457287
##   ..$ coord: int [1:6] 80726306 80657784 80706924 80670608 80715583 80644774
##   ..$ chr  : num [1:6] 13 13 13 13 13 13
##  $ TPM1    :'data.frame':    49 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 11063 11534 13425 21584 64871 77283 81337 84259 108975 109045 ...
##   ..$ coord: int [1:49] 63333846 63337496 63340700 63356886 63270641 63278936 63334937 63334506 63314579 63340331 ...
##   ..$ chr  : num [1:49] 15 15 15 15 15 15 15 15 15 15 ...
##  $ NOG1    :'data.frame':    10 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 65345 84453 105461 139939 161457 394385 435162 435758 465614 483664
##   ..$ coord: int [1:10] 54756137 54785335 54756052 54821256 54776791 54756378 54794017 54821128 54802123 54793185
##   ..$ chr  : num [1:10] 17 17 17 17 17 17 17 17 17 17
##  $ MAFB    :'data.frame':    23 obs. of  3 variables:
##   ..$ id   : Factor w/ 485577 levels "cg00000029","cg00000108",..: 12241 34306 50900 65551 110393 130009 178445 232661 238963 250747 ...
##   ..$ coord: int [1:23] 39315211 39316308 39317194 39309738 39318437 39312290 39312434 39318518 39316218 39314091 ...
##   ..$ chr  : num [1:23] 20 20 20 20 20 20 20 20 20 20 ...
# 4. choosing a subset of betas
which.rows.betas <- sapply( as.character( my.SNP.map$gene_name ), function(x){
    cur.cpgs <- my.CpGs[[ x ]]
    match( as.character( cur.cpgs$id ), rownames( betas.ff ) )
} )
names( which.rows.betas ) <- as.character( my.SNP.map$gene_name )
sapply( which.rows.betas, length )
##     PAX7    ABCA4     IRF6    THADA   8q21.3     8q24    FOXE1 KIAA1598 
##       65       21       60       15       17        3       23        1 
##    SPRY2     TPM1     NOG1     MAFB 
##        6       49       10       23
my.CpGs.betas <- lapply( as.character( my.SNP.map$gene_name ), function(x){
    cur.rows <- which.rows.betas[[ x ]]
    cur.which.na <- is.na( cur.rows )
    cur.rows <- cur.rows[ !cur.which.na ]
    if( !is.null( cur.rows ) ){
        tmp.betas <- betas.ff[ cur.rows, , drop = FALSE ]
        data.frame( CpG = rownames( tmp.betas ), position = my.CpGs[[ x ]]$coord[ !cur.which.na ],
            tmp.betas )
    }
} )
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpCGlgFl/tmp/RtmpIuQl74/ff75e06668bfb1.ff
names( my.CpGs.betas ) <- as.character( my.SNP.map$gene_name )

sapply( my.CpGs.betas, dim )
##      PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598 SPRY2 TPM1 NOG1 MAFB
## [1,]   64    21   57    14     17    3    22        1     6   46   10   23
## [2,]  870   870  870   870    870  870   870      870   870  870  870  870
# 5. create M values from the beta values
m <- function( beta ){
    log2( beta/( 1 - beta ) )
}

betas.chosen.CpGs.list <- lapply( as.character( my.SNP.map$gene_name ), function(x){
    cur.betas <- my.CpGs.betas[[ x ]]
    if( !is.null( cur.betas ) ){
        tmp.df <- gather( cur.betas, indiv, beta, starts_with( "BAE" ) )
        tmp.df <- arrange( tmp.df, CpG, indiv )
        mvals.tmp <- m( tmp.df$beta )
        cbind( name = x, tmp.df, mvalue = mvals.tmp )
    }
} )
names( betas.chosen.CpGs.list ) <- as.character( my.SNP.map$gene_name )
lapply( betas.chosen.CpGs.list, function(x){
     summary( x$mvalue )    
} )
## $PAX7
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -9.450  -6.019  -4.755  -4.106  -3.128   5.739 
## 
## $ABCA4
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -8.966   2.374   4.224   3.178   4.867   6.649 
## 
## $IRF6
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -9.554  -6.748  -4.827  -2.377   3.324   6.695 
## 
## $THADA
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -3.508   3.528   4.897   4.130   5.352   6.882 
## 
## $`8q21.3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.052   4.547   5.193   5.033   5.713   7.097 
## 
## $`8q24`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.6327  2.3549  5.0024  4.1278  5.4508  6.4113 
## 
## $FOXE1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -8.913  -6.069  -4.905  -3.812  -2.530   5.686 
## 
## $KIAA1598
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.150   4.927   5.151   5.123   5.325   6.085 
## 
## $SPRY2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.253   2.923   4.349   4.244   5.510   7.036 
## 
## $TPM1
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -10.3246  -6.4510   0.6053  -0.8714   4.4662   6.7038 
## 
## $NOG1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -8.2749 -4.2558  2.8391  0.4788  4.8034  6.2590 
## 
## $MAFB
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -9.242  -6.313  -4.988  -4.299  -3.668   4.870
betas.chosen.CpGs.df <- do.call( what = "rbind", betas.chosen.CpGs.list )
which.null <- unlist( lapply( betas.chosen.CpGs.list, is.null ) )
which.null
##     PAX7    ABCA4     IRF6    THADA   8q21.3     8q24    FOXE1 KIAA1598 
##    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE 
##    SPRY2     TPM1     NOG1     MAFB 
##    FALSE    FALSE    FALSE    FALSE
gene.labels <- paste( as.character( my.SNP.map$gene_name[ !which.null ] ), "( chr",
        my.SNP.map$chr[ !which.null ], ")" )
names( gene.labels ) <- as.character( my.SNP.map$gene_name[ !which.null ] )
gene.labels
##                  PAX7                 ABCA4                  IRF6 
##      "PAX7 ( chr 1 )"     "ABCA4 ( chr 1 )"      "IRF6 ( chr 1 )" 
##                 THADA                8q21.3                  8q24 
##     "THADA ( chr 2 )"    "8q21.3 ( chr 8 )"      "8q24 ( chr 8 )" 
##                 FOXE1              KIAA1598                 SPRY2 
##     "FOXE1 ( chr 9 )" "KIAA1598 ( chr 10 )"    "SPRY2 ( chr 13 )" 
##                  TPM1                  NOG1                  MAFB 
##     "TPM1 ( chr 15 )"     "NOG1 ( chr 17 )"     "MAFB ( chr 20 )"
# 6. choosing the individuals with genetic data also
all.indiv <- betas.chosen.CpGs.df$indiv
all.indiv <- t( sapply( all.indiv, function( x ){ unlist( strsplit( x, split = ".",
    fixed = TRUE ) ) } ) )
rownames( all.indiv ) <- NULL
betas.chosen.CpGs.df <- data.frame( betas.chosen.CpGs.df[ ,c( "name", "CpG", "position" ) ],
    indiv = all.indiv[ ,1 ], indiv.ctd = all.indiv[ ,2 ],
    betas.chosen.CpGs.df[ ,c( "beta", "mvalue" ) ], stringsAsFactors = FALSE )
all.indiv <- betas.chosen.CpGs.df$indiv
# 6a. matching individuals - separately for each cleft type
betas.df.subgroup <- lapply( cleft.types, function(x){
  ids.final <- read.table( file.names[ x ], header = TRUE, stringsAsFactors = FALSE )
  cat( "cleft type: ", x, "\n")
  print( dim( ids.final ) )
  
  betas.df.tmp <- c()
  for( p in ids.final$ID.BSI ){
    betas.df.tmp <- rbind( betas.df.tmp, betas.chosen.CpGs.df[
      betas.chosen.CpGs.df$indiv == p, ] )
  }
  return( betas.df.tmp)
} )
## cleft type:  CLO 
## [1] 103   5
## cleft type:  CL/P 
## [1] 269   5
## cleft type:  CPO 
## [1] 140   5
## cleft type:  CLP 
## [1] 166   5
## cleft type:  cntrl 
## [1] 456   5
names( betas.df.subgroup ) <- cleft.types
sapply( betas.df.subgroup, dim )
##        CLO  CL/P   CPO   CLP  cntrl
## [1,] 29252 76396 39760 47144 129504
## [2,]     7     7     7     7      7
# checking whether I took correct data
sapply( betas.df.subgroup, function(x){
  length( unique( x$indiv ) )
} )
##   CLO  CL/P   CPO   CLP cntrl 
##   103   269   140   166   456
# 7. saving the data
save( betas.df.subgroup, my.SNP.map, gene.labels,   file = "newest_betas_prepared.RData" )