library( ff )
## Loading required package: bit
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
## 
## Attaching package: 'bit'
## The following object is masked from 'package:base':
## 
##     xor
## Attaching package ff
## - getOption("fftempdir")=="C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ"
## - getOption("ffextension")=="ff"
## - getOption("ffdrop")==TRUE
## - getOption("fffinonexit")==TRUE
## - getOption("ffpagesize")==65536
## - getOption("ffcaching")=="mmnoflush"  -- consider "ffeachflush" if your system stalls on large writes
## - getOption("ffbatchbytes")==1030781665.28 -- consider a different value for tuning your system
## - getOption("ffmaxbytes")==51539083264 -- consider a different value for tuning your system
## 
## Attaching package: 'ff'
## The following objects are masked from 'package:bit':
## 
##     clone, clone.default, clone.list
## The following objects are masked from 'package:utils':
## 
##     write.csv, write.csv2
## The following objects are masked from 'package:base':
## 
##     is.factor, is.ordered
library( ggplot2 )
library( SuppDists )
library( Haplin )
library( dplyr )
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library( reshape2 )
library( matrixStats )
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
source( "gaphunter2.R" )
source( "../common_variables.R" )

map.ok <- read.table( "../new_SNPs.map", header = TRUE, stringsAsFactors = FALSE )

load( "../new_betas_matrices_ensembl_promoter_enhancer.RData" )
load( "../new_betas_df_subgroup_gene.RData" )
ls()
##  [1] "betas.df.subgroup.enhancer.list" "betas.df.subgroup.gene.list"    
##  [3] "betas.df.subgroup.promoter.list" "betas.matrix.enhancer.list"     
##  [5] "betas.matrix.gene.list"          "betas.matrix.promoter.list"     
##  [7] "cleft.types"                     "ffdata.file.names"              
##  [9] "file.names"                      "file.names.root"                
## [11] "gaphunter2"                      "map.ok"
# just testing on one case:
dim( betas.matrix.gene.list$CLO$PAX7 )
## [1] 103  54
matrix.tmp <- t( betas.matrix.gene.list$CLO$PAX7 )
gaphunter.res <- gaphunter2( matrix.tmp )
## [gaphunter] Using 54 probes and 103 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found 2 gap signals.
## [gaphunter] Filtering out gap signals driven by outliers.
## [gaphunter] Removed 0 gap signals driven by outliers from results.
gaphunter.res
## $proberesults
##            Groups Group1 Group2 Group3
## cg04276953      3      1     96      6
## cg12080079      3      1    101      1
## 
## $sampleresults
##            [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## cg04276953    2    2    2    2    3    2    3    2    2     2     2     2
## cg12080079    2    2    2    2    2    2    2    2    2     2     2     2
##            [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22]
## cg04276953     2     2     2     2     2     2     2     2     2     1
## cg12080079     2     2     2     2     2     2     2     2     2     2
##            [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32]
## cg04276953     2     2     2     2     2     2     2     2     2     2
## cg12080079     2     2     2     2     3     2     2     2     2     2
##            [,33] [,34] [,35] [,36] [,37] [,38] [,39] [,40] [,41] [,42]
## cg04276953     2     2     2     2     2     2     2     2     2     2
## cg12080079     2     2     2     2     2     2     2     2     2     2
##            [,43] [,44] [,45] [,46] [,47] [,48] [,49] [,50] [,51] [,52]
## cg04276953     2     2     2     2     3     2     2     2     2     2
## cg12080079     2     2     2     2     2     2     2     2     2     2
##            [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62]
## cg04276953     2     2     2     2     2     2     2     2     2     2
## cg12080079     2     2     2     2     2     2     2     2     2     2
##            [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72]
## cg04276953     2     2     2     3     2     2     2     2     2     2
## cg12080079     2     2     2     1     2     2     2     2     2     2
##            [,73] [,74] [,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82]
## cg04276953     2     2     2     2     2     2     2     2     2     2
## cg12080079     2     2     2     2     2     2     2     2     2     2
##            [,83] [,84] [,85] [,86] [,87] [,88] [,89] [,90] [,91] [,92]
## cg04276953     2     2     2     2     2     2     2     3     2     2
## cg12080079     2     2     2     2     2     2     2     2     2     2
##            [,93] [,94] [,95] [,96] [,97] [,98] [,99] [,100] [,101] [,102]
## cg04276953     2     2     2     2     2     3     2      2      2      2
## cg12080079     2     2     2     2     2     2     2      2      2      2
##            [,103]
## cg04276953      2
## cg12080079      2
## 
## $algorithm
## $algorithm$threshold
## [1] 0.05
## 
## $algorithm$outCutoff
## [1] 0.01
## 
## $algorithm$keepOutliers
## [1] FALSE
# combine the matrices to include all the individuals (per gene)
# -- for the gene regions --
sapply( betas.matrix.gene.list, length )
##   CLO  CL/P   CPO   CLP cntrl 
##     8     8     8     8     8
genes <- names( betas.matrix.gene.list[[ 1 ]] )
genes
## [1] "PAX7"     "ABCA4"    "IRF6"     "THADA"    "FOXE1"    "KIAA1598"
## [7] "TPM1"     "MAFB"
betas.matrix.gene.all <- lapply( genes, function( x ){
  tmp.list <- lapply( betas.matrix.gene.list, function( y ){
    return( y[[ x ]] )
  } )
  tmp.m <- do.call( rbind, tmp.list )
  unique.indiv <- unique( rownames( tmp.m ) )
  out.m <- tmp.m[ unique.indiv,, drop = FALSE ]
  return( out.m )
} )
names( betas.matrix.gene.all ) <- genes

# -- for the enhancer regions --
genes <- names( betas.matrix.enhancer.list[[ 1 ]] )
genes
## [1] "PAX7"     "ABCA4"    "THADA"    "8q24"     "KIAA1598" "SPRY2"
betas.matrix.enhancer.all <- lapply( genes, function( x ){
  tmp.list <- lapply( betas.matrix.enhancer.list, function( y ){
    return( y[[ x ]] )
  } )
  tmp.m <- do.call( rbind, tmp.list )
  unique.indiv <- unique( rownames( tmp.m ) )
  out.m <- tmp.m[ unique.indiv,, drop = FALSE ]
  return( out.m )
} )
names( betas.matrix.enhancer.all ) <- genes

# -- for the promoter regions --
genes <- names( betas.matrix.promoter.list[[ 1 ]] )
genes
##  [1] "PAX7"   "ABCA4"  "IRF6"   "THADA"  "8q21.3" "8q24"   "FOXE1" 
##  [8] "SPRY2"  "TPM1"   "NOG1"   "MAFB"
betas.matrix.promoter.all <- lapply( genes, function( x ){
  tmp.list <- lapply( betas.matrix.promoter.list, function( y ){
    return( y[[ x ]] )
  } )
  tmp.m <- do.call( rbind, tmp.list )
  unique.indiv <- unique( rownames( tmp.m ) )
  out.m <- tmp.m[ unique.indiv,, drop = FALSE ]
  return( out.m )
} )
names( betas.matrix.promoter.all ) <- genes



# running the gap hunter on all the subsets:
gaphunter.res.genes <- lapply( names( betas.matrix.gene.all ), function(x){
  cat( "   GENE: ",x, "\n")
  m <- t( betas.matrix.gene.all[[ x ]] )
  out <- gaphunter2( m )
  cat( "\n")
  
  return( out )
} )
##    GENE:  PAX7
## [gaphunter] Using 54 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found 1 gap signals.
## [gaphunter] Filtering out gap signals driven by outliers.
## [gaphunter] Removed 1 gap signals driven by outliers from results.
## 
##    GENE:  ABCA4
## [gaphunter] Using 16 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found 2 gap signals.
## [gaphunter] Filtering out gap signals driven by outliers.
## [gaphunter] Removed 0 gap signals driven by outliers from results.
## 
##    GENE:  IRF6
## [gaphunter] Using 27 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found 1 gap signals.
## [gaphunter] Filtering out gap signals driven by outliers.
## [gaphunter] Removed 1 gap signals driven by outliers from results.
## 
##    GENE:  THADA
## [gaphunter] Using 14 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found 1 gap signals.
## [gaphunter] Filtering out gap signals driven by outliers.
## [gaphunter] Removed 1 gap signals driven by outliers from results.
## 
##    GENE:  FOXE1
## [gaphunter] Using 7 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  KIAA1598
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  TPM1
## [gaphunter] Using 29 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  MAFB
## [gaphunter] Using 9 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
names( gaphunter.res.genes ) <- names( betas.matrix.gene.all )

gaphunter.res.prom <- lapply( names( betas.matrix.promoter.all ), function(x){
  cat( "   GENE: ",x, "\n")
  m <- t( betas.matrix.promoter.all[[ x ]] )
  out <- gaphunter2( m )
  cat( "\n")
  
  return( out )
} )
##    GENE:  PAX7
## [gaphunter] Using 19 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  ABCA4
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  IRF6
## [gaphunter] Using 14 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  THADA
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  8q21.3
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  8q24
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  FOXE1
## [gaphunter] Using 6 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  SPRY2
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  TPM1
## [gaphunter] Using 30 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  NOG1
## [gaphunter] Using 2 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  MAFB
## [gaphunter] Using 21 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found 1 gap signals.
## [gaphunter] Filtering out gap signals driven by outliers.
## [gaphunter] Removed 1 gap signals driven by outliers from results.
names( gaphunter.res.prom ) <- names( betas.matrix.promoter.all )

gaphunter.res.enh <- lapply( names( betas.matrix.enhancer.all ), function(x){
  cat( "   GENE: ",x, "\n")
  m <- t( betas.matrix.enhancer.all[[ x ]] )
  out <- gaphunter2( m )
  cat( "\n")
  
  return( out )
} )
##    GENE:  PAX7
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  ABCA4
## [gaphunter] Using 6 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  THADA
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  8q24
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  KIAA1598
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
## 
##    GENE:  SPRY2
## [gaphunter] Using 1 probes and 865 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found no gap signals.
names( gaphunter.res.enh ) <- names( betas.matrix.enhancer.all )
# create a list with only the gap CpGs

which.res.empty <- sapply( gaphunter.res.genes, function(x){
  if( is.null( x$proberesults ) ){
    return( TRUE )
  }
  return( nrow( x$proberesults ) == 0 )
} )
gaphunter.res.genes <- gaphunter.res.genes[ !which.res.empty ]
if( length( gaphunter.res.genes ) != 0 ){
  gaps.cpgs.genes <- lapply( names( gaphunter.res.genes ), function( x ){
    cat( "   GENE: ",x, "\n")
    out <- rownames( gaphunter.res.genes[[ x ]]$sampleresults )
  
    print( out )
    cat( "\n")
    return( out )
  } )
  names( gaps.cpgs.genes ) <- names( gaphunter.res.genes )
}
##    GENE:  ABCA4 
## [1] "cg01921066" "cg04350215"
which.res.empty <- sapply( gaphunter.res.prom, function(x){
  if( is.null( x$proberesults ) ){
    return( TRUE )
  }
  return( nrow( x$proberesults ) == 0 )
} )
gaphunter.res.prom <- gaphunter.res.prom[ !which.res.empty ]
if( length( gaphunter.res.prom ) != 0 ){
  gaps.cpgs.prom <- lapply( names( gaphunter.res.prom ), function( x ){
    cat( "   GENE: ",x, "\n")
    out <- rownames( gaphunter.res.prom[[ x ]]$sampleresults )
  
    print( out )
    cat( "\n")
    return( out )
  } )
  names( gaps.cpgs.prom ) <- names( gaphunter.res.prom )
}

which.res.empty <- sapply( gaphunter.res.enh, function(x){
  if( is.null( x$proberesults ) ){
    return( TRUE )
  }
  return( nrow( x$proberesults ) == 0 )
} )
gaphunter.res.enh <- gaphunter.res.enh[ !which.res.empty ]
if( length( gaphunter.res.enh ) != 0 ){
  gaps.cpgs.enh <- lapply( names( gaphunter.res.enh ), function( x ){
    cat( "   GENE: ",x, "\n")
    out <- rownames( gaphunter.res.enh[[ x ]]$sampleresults )
  
    print( out )
    cat( "\n")
    return( out )
  } )
  names( gaps.cpgs.enh ) <- names( gaphunter.res.enh )
}

# create a data frame:
gaps.cpgs.genes.df <- melt( gaps.cpgs.genes )
gaps.cpgs.genes.df
##        value    L1
## 1 cg01921066 ABCA4
## 2 cg04350215 ABCA4
gaps.cpgs.all.df <- cbind( gaps.cpgs.genes.df, region = "gene" )

names.gaps.df <- c( "cpg", "gene", "region" )
names( gaps.cpgs.all.df ) <- names.gaps.df

gaps.cpgs.all.df <- arrange( gaps.cpgs.all.df, cpg )
gaps.cpgs.all.df
##          cpg  gene region
## 1 cg01921066 ABCA4   gene
## 2 cg04350215 ABCA4   gene
save( gaps.cpgs.genes, gaps.cpgs.all.df, file = "gaps_CpGs_all.RData" )

# grouping the output
gaps.cpgs.all.df <- group_by( gaps.cpgs.all.df, cpg )
gaps.cpgs.all.df
## # A tibble: 2 x 3
## # Groups:   cpg [2]
##   cpg        gene  region
##   <fct>      <chr> <fct> 
## 1 cg01921066 ABCA4 gene  
## 2 cg04350215 ABCA4 gene
cpg.all <- read.csv( "../../ORIG_DATA/cpg_info_all.csv", stringsAsFactors = FALSE )
snps.Iowa.all <- read.table( "../unique_SNPs.dat", stringsAsFactors = FALSE )
# genetic data - checking whether I have SNPs within the probes found
#   to be bimodal by gaphunter

# for each CpG
unique.gaps.cpgs <- unique( gaps.cpgs.all.df$cpg )
for( cur.cpg in unique.gaps.cpgs ){
  cat( "\n" )
  cat( "CURRENT CpG: ", cur.cpg, "\n" )
  cur.regions <- filter( gaps.cpgs.all.df, cpg == cur.cpg ) %>%
    select( cpg, region ) %>% distinct()
  # info about the curent CpG and the probe
  cur.probe.snps <- filter( cpg.all, TargetID == cur.cpg ) %>%
    select( TargetID, PROBE_SNPS )
  
  if( cur.probe.snps$PROBE_SNPS != "" ){
    cur.probe.snps <- cur.probe.snps$PROBE_SNPS
    existing.snps <- cur.probe.snps[ cur.probe.snps %in% snps.Iowa.all ]
    
    if( length( existing.snps ) != 0 ){
      cat( " SNPs found: ", existing.snps, "\n" )
      # for each SNP
      # for( cur.snp in existing.snps ){
      #   cat( "   CURRENT SNP: ", cur.snp, "\n" )
      #   
      #   haplin.data.Iowa <- genDataLoad( file.path( "..", ffdata.file.names[ chosen.cleft ] ) )
      #   
      #   ids.final <- read.table( file = file.path( "..", file.names[[ chosen.cleft ]] ),
      #       stringsAsFactors = FALSE, header = TRUE )
      #   
      #   which.cur.snp <- match( cur.snp, as.character( map.ok$snpname ) )
      # 
      #   haplin.data.cur.SNP <- genDataGetPart( haplin.data.Iowa, design = "triad",
      #     markers = which.cur.snp, overwrite = TRUE )
      #   haplin.data.cur.SNP
      #   
      #   if( "gene" %in% cur.regions ){
      #     betas.df.subgroup.tmp <- filter( 
      #       betas.df.subgroup.gene.list[[ chosen.cleft ]], CpG == cur.cpg )
      #   } else if ( "promoter" %in% cur.regions ){
      #     betas.df.subgroup.tmp <- filter( 
      #       betas.df.subgroup.promoter.list[[ chosen.cleft ]], CpG == cur.cpg )
      #   } else {
      #     betas.df.subgroup.tmp <- filter( 
      #       betas.df.subgroup.enhancer.list[[ chosen.cleft ]], CpG == cur.cpg )
      #   }
      #   which.indiv.cur.cpg <- betas.df.subgroup.tmp$indiv
      #   
      #   unique.indiv <- unique( which.indiv.cur.cpg )
      #   fam.id <- as.character( sapply( unique.indiv, function(x){
      #     cur.row <- ids.final.cl[ ids.final.cl$ID.BSI == x, ]
      #     ids3.cl$cidr3[ cur.row$key2.fam ]
      #   } ) )
      #   
      #   file.subset.haplin <- paste( "CL_subset", cur.snp, cur.cpg, sep = "_" )
      #   haplin.subset.data <- genDataGetPart( data.in = haplin.data.cur.SNP, design = "triad",
      #     indiv.ids = fam.id, file.out = file.subset.haplin, overwrite = TRUE )
      # 
      #   cat( "      ALLELE TABLE:\n" )
      #   allele.tab.cur.snp <- table( haplin.subset.data$gen.data[[1]][,] )
      #   allele.tab.cur.snp
      #   
      #   allele.tab.cur.snp[ 1 ]/sum( allele.tab.cur.snp )
      #   allele.tab.cur.snp[ 2 ]/sum( allele.tab.cur.snp )
      #   cat( "\n" )
      # 
      #   # now, I need to construct a data.frame with the indiv.id, allele and methyl.level:
      #   SNP.CpG.df <- data.frame( indiv.id = unique.indiv, fam.id = fam.id,
      #     cur.snp = paste0(
      #       as.character( haplin.subset.data$gen.data[[1]][,1] ),
      #       as.character( haplin.subset.data$gen.data[[1]][,2] ) ),
      #     cur.cpg = betas.df.subgroup.tmp$beta
      #   )
      #   SNP.CpG.df
      #   
      #   ggplot( SNP.CpG.df, aes( as.factor( cur.snp ), cur.cpg ) ) +
      #     geom_jitter( aes( colour = as.factor( cur.snp ) ) ) +
      #     scale_color_discrete( "allele" ) +
      #     xlab( paste( "SNP", cur.snp ) ) +
      #     ylab( paste( "CpG", cur.cpg ) )
      # }
    } else {
      message( "No probe SNPs in our data!" )
    }
  } else {
      # maybe there are SNPs not in the manifest but in our data?
      cur.cpg.pos <- filter( cpg.all, TargetID == cur.cpg ) %>%
        select( TargetID, MAPINFO, CHR, STRAND )
      cur.range <- cur.cpg.pos$MAPINFO
      if( cur.cpg.pos$STRAND == "F" ){
        # forward strand
        cur.range <- c( cur.range, cur.range + 50 )
      } else {
        # reverse strand
        cur.range <- c( cur.range - 50, cur.range )
      }

    message( "No probe SNPS for ", cur.cpg )
  }
}
## 
## CURRENT CpG:  cg01921066
## No probe SNPS for cg01921066
## 
## CURRENT CpG:  cg04350215
## No probe SNPS for cg04350215
# plotting only the gaps CpGs
library( ggExtra )

# gather all the data for those CpGs
chosen.gene <- unique( gaps.cpgs.all.df$gene )
bvals.plotting.list <- lapply( names( betas.df.subgroup.gene.list ), function(x){
  cur.df <- betas.df.subgroup.gene.list[[ x ]]
  out <- filter( cur.df, name == chosen.gene & CpG %in% gaps.cpgs.all.df$cpg )
  return( cbind( out, cleft_type = x ) )
})
sapply( bvals.plotting.list, dim )
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  206  538  280  332  912
## [2,]   10   10   10   10   10
bvals.plotting.df <- do.call( bind_rows, bvals.plotting.list )
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
ggplot( bvals.plotting.df, aes( x= cleft_type, y = beta ) ) +
  geom_jitter() +
  facet_wrap( .~CpG )

# plotting to check the correlation with the SNP we're investigating
bvals.plotting.df$allele1 <- ""
bvals.plotting.df$allele2 <- ""
gaps.cpgs.genotypes.list <- list()
for( chosen.cleft in cleft.types ){
  cat( "\n\nCLEFT TYPE: ", chosen.cleft, "\n")
  haplin.data.Iowa <- genDataLoad( file.path( "..", ffdata.file.names[ chosen.cleft ] ) )

  ids.final <- read.table( file = file.path( "..", file.names[[ chosen.cleft ]] ),
      stringsAsFactors = FALSE, header = TRUE )

  which.cur.snp <- match( chosen.gene, as.character( map.ok$gene_name ) )

  haplin.data.cur.SNP <- genDataGetPart( haplin.data.Iowa, design = "triad",
    markers = which.cur.snp, overwrite = TRUE )
  haplin.data.cur.SNP

  cur.cpg <- as.character( gaps.cpgs.all.df$cpg[1] ) # the individuals are the same, irrespective of CpG
  betas.df.subgroup.tmp <- filter(
    betas.df.subgroup.gene.list[[ chosen.cleft ]], as.character( CpG ) == cur.cpg )
  which.indiv.cur.cpg <- betas.df.subgroup.tmp$indiv

  unique.indiv <- unique( which.indiv.cur.cpg )
  fam.id <- as.character( sapply( unique.indiv, function(x){
    if( sum( ids.final$ID.BSI == x ) != 0 ){
        ids.final$ID.N2[ ids.final$ID.BSI == x ]
    }
  } ) )
  chosen.indiv <- haplin.data.cur.SNP$cov.data[ match( fam.id,
    as.character( haplin.data.cur.SNP$cov.data[ ,"id" ] ) ), "id" ]
  which.null <- which( chosen.indiv == "0" )
  if( length( which.null ) != 0 ){
    chosen.indiv <- chosen.indiv[ -which.null ]
  }

  file.subset.haplin <- "subset1"
  haplin.subset.data <- genDataGetPart( data.in = haplin.data.cur.SNP, design = "triad",
    id = chosen.indiv, file.out = file.subset.haplin, overwrite = TRUE )
  
  gaps.cpgs.genotypes.list <- c( gaps.cpgs.genotypes.list, list( haplin.subset.data ) )

  cat( "      ALLELE TABLE:\n" )
  allele.tab.cur.snp <- table( haplin.subset.data$gen.data[[1]][,] )
  allele.tab.cur.snp

  allele.tab.cur.snp[ 1 ]/sum( allele.tab.cur.snp )
  allele.tab.cur.snp[ 2 ]/sum( allele.tab.cur.snp )
  cat( "\n" )

  # now, I need to construct a data.frame with the indiv.id, allele and methyl.level:
  bvals.plotting.df$allele1[ 
    with( bvals.plotting.df, indiv %in% which.indiv.cur.cpg )
    ] <- as.character( haplin.subset.data$gen.data[[1]][,1] )
  bvals.plotting.df$allele2[ 
    with( bvals.plotting.df, indiv %in% which.indiv.cur.cpg )
    ] <- as.character( haplin.subset.data$gen.data[[1]][,2] )
}
## 
## 
## CLEFT TYPE:  CLO 
## The output file(s) exist! 
## Provided arguments:
##  --- chosen markers: 2 
## INFO: Will select 1217 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/Rtmpq6iF6H/ff4c423697037.ff
## Saving data... 
## ... saved to files: ./my_data_part_gen.ffData, ./my_data_part_gen.RData
## The output file(s) exist! 
## Provided arguments:
##  --- argument: id, chosen values: 11, 23, 27, 47, 56, 70 ... 666, 668, 669, 674, 676, 678
## INFO: Will select 103 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc46d57b48.ff
## Saving data... 
## ... saved to files: ./subset1_gen.ffData, ./subset1_gen.RData
##       ALLELE TABLE:
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc42e940b2.ff
## 
## 
## 
## CLEFT TYPE:  CL/P 
## The output file(s) exist! 
## Provided arguments:
##  --- chosen markers: 2 
## INFO: Will select 3447 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/Rtmpq6iF6H/ff4c4180052a6.ff
## Saving data... 
## ... saved to files: ./my_data_part_gen.ffData, ./my_data_part_gen.RData
## The output file(s) exist! 
## Provided arguments:
##  --- argument: id, chosen values: 1, 5, 8, 11, 13, 15 ... 675, 676, 678, 682, 683, 684
## INFO: Will select 269 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc41adf11ce.ff
## Saving data... 
## ... saved to files: ./subset1_gen.ffData, ./subset1_gen.RData
##       ALLELE TABLE:
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc41b343ea0.ff
## 
## 
## 
## CLEFT TYPE:  CPO 
## The output file(s) exist! 
## Provided arguments:
##  --- chosen markers: 2 
## INFO: Will select 1829 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/Rtmpq6iF6H/ff4c44a7ad35.ff
## Saving data... 
## ... saved to files: ./my_data_part_gen.ffData, ./my_data_part_gen.RData
## The output file(s) exist! 
## Provided arguments:
##  --- argument: id, chosen values: 2, 3, 16, 24, 28, 30 ... 616, 626, 628, 630, 637, 687
## INFO: Will select 140 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc45e21c47.ff
## Saving data... 
## ... saved to files: ./subset1_gen.ffData, ./subset1_gen.RData
##       ALLELE TABLE:
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc49cd503b.ff
## 
## 
## 
## CLEFT TYPE:  CLP 
## The output file(s) exist! 
## Provided arguments:
##  --- chosen markers: 2 
## INFO: Will select 2230 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/Rtmpq6iF6H/ff4c44ce71911.ff
## Saving data... 
## ... saved to files: ./my_data_part_gen.ffData, ./my_data_part_gen.RData
## The output file(s) exist! 
## Provided arguments:
##  --- argument: id, chosen values: 1, 5, 8, 13, 15, 19 ... 671, 672, 675, 682, 683, 684
## INFO: Will select 166 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc47ec250d3.ff
## Saving data... 
## ... saved to files: ./subset1_gen.ffData, ./subset1_gen.RData
##       ALLELE TABLE:
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc4a9df0a.ff
## 
## 
## 
## CLEFT TYPE:  cntrl 
## The output file(s) exist! 
## Provided arguments:
##  --- chosen markers: 2 
## INFO: Will select 11461 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/Rtmpq6iF6H/ff4c43ff22ad9.ff
## Saving data... 
## ... saved to files: ./my_data_part_gen.ffData, ./my_data_part_gen.RData
## The output file(s) exist! 
## Provided arguments:
##  --- argument: id, chosen values: 50002, 50004, 50005, 50006, 50008, 50010 ... 51022, 51023, 51026, 51029, 51030, 51031
## INFO: Will select 456 rows and 6 columns.
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc4330a5d82.ff
## Saving data... 
## ... saved to files: ./subset1_gen.ffData, ./subset1_gen.RData
##       ALLELE TABLE:
## opening ff C:/Users/jro049/AppData/Local/Temp/RtmpcB0PXJ/ff1fc42cf068c9.ff
names( gaps.cpgs.genotypes.list ) <- cleft.types

table( bvals.plotting.df$allele1 )
## 
##    A    G 
## 1600  435
table( bvals.plotting.df$allele2 )
## 
##    A    G 
##  607 1428
bvals.plotting.df$alleles <- paste0( bvals.plotting.df$allele1, bvals.plotting.df$allele2 )

ggplot( bvals.plotting.df, aes( x= 1, y = beta, color = as.factor( alleles ) ) ) +
  geom_jitter() +
  facet_grid( cleft_type~CpG )