load( "../newest_betas_prepared.RData" )
load( "../new_betas_matrices_ensembl_promoter_enhancer.RData" )
load( "../new_betas_df_subgroup_gene.RData" )
source( "../common_variables.R" )

library( ggplot2 )

# Average beta value per row for every region.
# Each *.list object is a two-level list; every innermost element is reduced to
# its row sums divided by its number of columns, so the result keeps the same
# two-level structure with one named numeric vector per element (the row names
# are used further down as individual identifiers).
betas.sum.by.enhancer <- lapply( betas.matrix.enhancer.list, function( matrices.in.group ){
  lapply( matrices.in.group, function( beta.mat ){
    n.cols <- ncol( beta.mat )
    rowSums( beta.mat ) / n.cols
  } )
} )

# Same reduction for the promoter matrices.
betas.sum.by.promoter <- lapply( betas.matrix.promoter.list, function( matrices.in.group ){
  lapply( matrices.in.group, function( beta.mat ){
    n.cols <- ncol( beta.mat )
    rowSums( beta.mat ) / n.cols
  } )
} )

# Same reduction for the gene matrices.
betas.sum.by.gene <- lapply( betas.matrix.gene.list, function( matrices.in.group ){
  lapply( matrices.in.group, function( beta.mat ){
    n.cols <- ncol( beta.mat )
    rowSums( beta.mat ) / n.cols
  } )
} )

# ENHANCER
# Long-format table per cleft type: one row per (gene, individual) with the
# averaged beta value (sum.m.val) and the facet label "<marker> (<gene>)".
# Iterating over the list itself keeps the cleft-type names, and seq_along()
# copes with an empty inner list (1:length() would yield c(1, 0) and fail).
betas.sum.by.enhancer.df <- lapply( betas.sum.by.enhancer, function( sums.by.gene ){
  per.gene <- lapply( seq_along( sums.by.gene ), function( i ){
    cur.gene <- names( sums.by.gene )[ i ]
    # which() ignores NA comparisons caused by missing gene_name entries;
    # unique() + collapse keep the label a single string even when several
    # markers map to the gene, so it is never recycled across individuals.
    cur.snp <- unique( tolower(
      my.SNP.map$Marker[ which( my.SNP.map$gene_name == cur.gene ) ] ) )
    cur.label <- paste0( paste( cur.snp, collapse = ", " ), " (", cur.gene, ")" )
    cur.sum <- sums.by.gene[[ i ]]
    data.frame( gene = cur.gene, indiv = names( cur.sum ), sum.m.val = as.numeric( cur.sum ),
        labels = cur.label, stringsAsFactors = FALSE )
  } )
  do.call( rbind, per.gene )
} )

# Companion table per cleft type: one row per gene with the median of the
# averaged beta values, labelled exactly like the table above so that the
# facets of both layers line up when plotting.
betas.median.by.enhancer <- lapply( betas.sum.by.enhancer, function( sums.by.gene ){
  per.gene <- lapply( seq_along( sums.by.gene ), function( i ){
    cur.gene <- names( sums.by.gene )[ i ]
    cur.snp <- unique( tolower(
      my.SNP.map$Marker[ which( my.SNP.map$gene_name == cur.gene ) ] ) )
    cur.label <- paste0( paste( cur.snp, collapse = ", " ), " (", cur.gene, ")" )
    # stringsAsFactors = FALSE keeps the column types identical to the table
    # above regardless of the R version's default.
    data.frame( gene = cur.gene, labels = cur.label,
        beta.median = median( sums.by.gene[[ i ]] ), stringsAsFactors = FALSE )
  } )
  do.call( rbind, per.gene )
} )

# One figure per cleft type: histograms of the averaged enhancer beta values,
# one facet per "<marker> (<gene>)" label, with the per-gene median drawn as a
# red vertical line.
for( chosen.cleft in cleft.types ){
  cur.plot <- ggplot( betas.sum.by.enhancer.df[[ chosen.cleft ]], aes( sum.m.val ) ) +
    geom_histogram( position = "dodge" ) +
    facet_wrap( ~ labels, ncol = 3, scales = "free" ) +
    labs( x = "avg. sum of beta" ) +
    geom_vline( data = betas.median.by.enhancer[[ chosen.cleft ]],
        aes( xintercept = beta.median ), color = "red" )
  # The plot is never printed inside the loop, so pass it to ggsave()
  # explicitly instead of depending on the implicit last_plot() state.
  ggsave( paste0( "Avg_sum_beta_hists_enhancers_", file.names.root[ chosen.cleft ], ".jpg" ),
          plot = cur.plot, dpi = 200, height = 4 )
}
## Saving 7 x 4 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 4 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 4 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 4 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 4 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# PROMOTER
# Long-format table per cleft type: one row per (gene, individual) with the
# averaged beta value (sum.m.val) and the facet label "<marker> (<gene>)".
# Iterating over the list itself keeps the cleft-type names, and seq_along()
# copes with an empty inner list (1:length() would yield c(1, 0) and fail).
betas.sum.by.promoter.df <- lapply( betas.sum.by.promoter, function( sums.by.gene ){
  per.gene <- lapply( seq_along( sums.by.gene ), function( i ){
    cur.gene <- names( sums.by.gene )[ i ]
    # which() ignores NA comparisons caused by missing gene_name entries;
    # unique() + collapse keep the label a single string even when several
    # markers map to the gene, so it is never recycled across individuals.
    cur.snp <- unique( tolower(
      my.SNP.map$Marker[ which( my.SNP.map$gene_name == cur.gene ) ] ) )
    cur.label <- paste0( paste( cur.snp, collapse = ", " ), " (", cur.gene, ")" )
    cur.sum <- sums.by.gene[[ i ]]
    data.frame( gene = cur.gene, indiv = names( cur.sum ), sum.m.val = as.numeric( cur.sum ),
        labels = cur.label, stringsAsFactors = FALSE )
  } )
  do.call( rbind, per.gene )
} )

# Companion table per cleft type: one row per gene with the median of the
# averaged beta values, labelled exactly like the table above so that the
# facets of both layers line up when plotting.
betas.median.by.promoter <- lapply( betas.sum.by.promoter, function( sums.by.gene ){
  per.gene <- lapply( seq_along( sums.by.gene ), function( i ){
    cur.gene <- names( sums.by.gene )[ i ]
    cur.snp <- unique( tolower(
      my.SNP.map$Marker[ which( my.SNP.map$gene_name == cur.gene ) ] ) )
    cur.label <- paste0( paste( cur.snp, collapse = ", " ), " (", cur.gene, ")" )
    # stringsAsFactors = FALSE keeps the column types identical to the table
    # above regardless of the R version's default.
    data.frame( gene = cur.gene, labels = cur.label,
        beta.median = median( sums.by.gene[[ i ]] ), stringsAsFactors = FALSE )
  } )
  do.call( rbind, per.gene )
} )

# One figure per cleft type: histograms of the averaged promoter beta values,
# one facet per "<marker> (<gene>)" label, with the per-gene median drawn as a
# red vertical line.
for( chosen.cleft in cleft.types ){
  cur.plot <- ggplot( betas.sum.by.promoter.df[[ chosen.cleft ]], aes( sum.m.val ) ) +
    geom_histogram( position = "dodge" ) +
    facet_wrap( ~ labels, ncol = 3, scales = "free" ) +
    labs( x = "avg. sum of beta" ) +
    geom_vline( data = betas.median.by.promoter[[ chosen.cleft ]],
        aes( xintercept = beta.median ), color = "red" )
  # The plot is never printed inside the loop, so pass it to ggsave()
  # explicitly instead of depending on the implicit last_plot() state.
  ggsave( paste0( "Avg_sum_beta_hists_promoters_", file.names.root[ chosen.cleft ], ".jpg" ),
    plot = cur.plot, dpi = 200, width = 8.68, height = 7.6 )
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# GENE
# Long-format table per cleft type: one row per (gene, individual) with the
# averaged beta value (sum.m.val) and the facet label "<marker> (<gene>)".
# Iterating over the list itself keeps the cleft-type names, and seq_along()
# copes with an empty inner list (1:length() would yield c(1, 0) and fail).
betas.sum.by.gene.df <- lapply( betas.sum.by.gene, function( sums.by.gene ){
  per.gene <- lapply( seq_along( sums.by.gene ), function( i ){
    cur.gene <- names( sums.by.gene )[ i ]
    # which() ignores NA comparisons caused by missing gene_name entries;
    # unique() + collapse keep the label a single string even when several
    # markers map to the gene, so it is never recycled across individuals.
    cur.snp <- unique( tolower(
      my.SNP.map$Marker[ which( my.SNP.map$gene_name == cur.gene ) ] ) )
    cur.label <- paste0( paste( cur.snp, collapse = ", " ), " (", cur.gene, ")" )
    cur.sum <- sums.by.gene[[ i ]]
    data.frame( gene = cur.gene, indiv = names( cur.sum ), sum.m.val = as.numeric( cur.sum ),
        labels = cur.label, stringsAsFactors = FALSE )
  } )
  do.call( rbind, per.gene )
} )

# Companion table per cleft type: one row per gene with the median of the
# averaged beta values, labelled exactly like the table above so that the
# facets of both layers line up when plotting.
betas.median.by.gene <- lapply( betas.sum.by.gene, function( sums.by.gene ){
  per.gene <- lapply( seq_along( sums.by.gene ), function( i ){
    cur.gene <- names( sums.by.gene )[ i ]
    cur.snp <- unique( tolower(
      my.SNP.map$Marker[ which( my.SNP.map$gene_name == cur.gene ) ] ) )
    cur.label <- paste0( paste( cur.snp, collapse = ", " ), " (", cur.gene, ")" )
    # stringsAsFactors = FALSE keeps the column types identical to the table
    # above regardless of the R version's default.
    data.frame( gene = cur.gene, labels = cur.label,
        beta.median = median( sums.by.gene[[ i ]] ), stringsAsFactors = FALSE )
  } )
  do.call( rbind, per.gene )
} )

# One figure per cleft type: histograms of the averaged gene beta values,
# one facet per "<marker> (<gene>)" label, with the per-gene median drawn as a
# red vertical line.
for( chosen.cleft in cleft.types ){
  cur.plot <- ggplot( betas.sum.by.gene.df[[ chosen.cleft ]], aes( sum.m.val ) ) +
    geom_histogram( position = "dodge" ) +
    facet_wrap( ~ labels, ncol = 3, scales = "free" ) +
    labs( x = "avg. sum of beta" ) +
    geom_vline( data = betas.median.by.gene[[ chosen.cleft ]],
        aes( xintercept = beta.median ), color = "red" )
  # The plot is never printed inside the loop, so pass it to ggsave()
  # explicitly instead of depending on the implicit last_plot() state.
  # No width/height given: ggsave() falls back to the current device size.
  ggsave( paste0( "Avg_sum_beta_hists_genes_", file.names.root[ chosen.cleft ], ".jpg" ),
      plot = cur.plot, dpi = 200 )
}
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.