# Creating a nice overview of the annotations within the 50kbp regions of the chosen SNPs
# using the entire dataset! (i.e., not splitting into cleft-types!)
# setwd( "PLOTTING" )

load( "../new_betas_df_subgroup_gene.RData" )
load( "../new_betas_matrices_ensembl_promoter_enhancer.RData" )
load( "../newest_betas_prepared.RData" )
cpg.islands.all <- read.table( "../../ORIG_DATA/CpG_islands_hg19.dat", header = TRUE, sep = "\t" )
cpg.info <- read.csv( "../../ORIG_DATA/cpg_info_all.csv" )

cpg.info$TargetID <- as.character( cpg.info$TargetID )
rownames( cpg.info ) <- cpg.info$TargetID
cpg.info <- cpg.info[ !is.na(cpg.info$MAPINFO), ]

dim( betas.df.subgroup.gene.list[[ 1 ]] )
## [1] 16171     9
source( "clean_betas_annotation.R" )
source( "../common_variables.R" )
library( ggplot2 )
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library( dplyr )
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# col.names <- c( "name","CpG","position","indiv","indiv.ctd",
  # "beta","mvalue","ensembl_reg_feat","CGI" )

betas.df.subgroup.all <- tibble( name = "A",
                                 CpG = "cpg",
                                 position = 1,
                                 indiv = "i",
                                 indiv.ctd = "i2",
                                 beta = 0.1,
                                 mvalue = 0.1,
                                 ensembl_reg_feat = "reg",
                                 CGI = FALSE )
for( chosen.cleft in cleft.types ){
  file.out <- paste0( "annotation_overview_chosenSNPregions_",
                      file.names.root[ chosen.cleft ], ".RData" )
  clean.betas.annotation( betas.df.subgroup.gene = betas.df.subgroup.gene.list[[ chosen.cleft ]],
      betas.df.subgroup.promoter = betas.df.subgroup.promoter.list[[ chosen.cleft ]],
      betas.df.subgroup.enhancer = betas.df.subgroup.enhancer.list[[ chosen.cleft ]],
      betas.df.subgroup = betas.df.subgroup[[ chosen.cleft ]],
      cpg.islands.all, cpg.info, file.out, print = FALSE )
  
  load( paste0( "annotation_overview_chosenSNPregions_",
                        file.names.root[ chosen.cleft ], ".RData" ) )
  betas.df.subgroup.all <- union( betas.df.subgroup.all, betas.df.subgroup.new.reg.feat )
}
## Warning: Column `CpG` joining character vector and factor, coercing into
## character vector

## Warning: Column `CpG` joining character vector and factor, coercing into
## character vector

## Warning: Column `CpG` joining character vector and factor, coercing into
## character vector

## Warning: Column `CpG` joining character vector and factor, coercing into
## character vector

## Warning: Column `CpG` joining character vector and factor, coercing into
## character vector
betas.df.subgroup.all <- betas.df.subgroup.all[ -1, ]
# for ordering of CpGs with their position:
betas.df.subgroup.new.reg.feat.ordered <- betas.df.subgroup.all[
    order( betas.df.subgroup.all$name, betas.df.subgroup.all$position ), ]
betas.df.subgroup.new.reg.feat.ordered$cpg.order <- with(
    betas.df.subgroup.new.reg.feat.ordered, paste( CpG, name, sep = "." ) )
# changing the order of factor levels:
betas.df.subgroup.new.reg.feat.ordered$ensembl_reg_feat <- factor(
    betas.df.subgroup.new.reg.feat.ordered$ensembl_reg_feat,
    levels = c( "enhancer", "enhancer-gene", "gene", "promoter", "promoter-gene",
        "promoter_flank", "promoter_flank-gene", "none" ), ordered = TRUE )
# PLOTTING    
# function for extracting the text for x axis labelling
xlab.fun <- function(x){
  sapply( x, function(y){
    splitted <- unlist( strsplit( y, split = ".", fixed = TRUE ) )
    l <- length( splitted )
    last.splitted <- splitted[ l ]
    # there is one special case when the name of the locus has a dot
    if( last.splitted == "3" ){
      # return the name of cpg
      return( paste( splitted[ -( (l-1):l ) ], collapse = "." ) )
    }
    return( paste( splitted[ -l ], collapse = "." ) )
  } )
}

# facet names - SNP id and gene in parenthesis
tmp.all.snp.names <- tolower( as.character( my.SNP.map$Marker[ match(
    betas.df.subgroup.new.reg.feat.ordered$name, my.SNP.map$gene_name ) ] ) )
any( is.na( tmp.all.snp.names ) )
## [1] FALSE
tmp.all.gene.names <- as.character( betas.df.subgroup.new.reg.feat.ordered$name )
# tmp.all.gene.names[ tmp.all.gene.names == "hCG_1814486" ] <- "intergenic"
betas.df.subgroup.new.reg.feat.ordered$labels <- paste0( tmp.all.snp.names, " (",
    tmp.all.gene.names, ")" )
# plotting the beta values
p2 <- ggplot( betas.df.subgroup.new.reg.feat.ordered,
    aes( x = factor( cpg.order, levels = unique( cpg.order ) ), y = beta ) )

p2 + geom_point( aes( color = ensembl_reg_feat, shape = as.factor( CGI ) ) ) +
    facet_wrap( ~ labels, scales = "free_x", ncol = 2 ) +
    scale_color_brewer( name = "regulatory feature", type = "qual", palette = "Dark2" ) +
    scale_shape_discrete( name = "CGI" ) +
    scale_x_discrete( labels = xlab.fun ) +
    theme( legend.position = "bottom", axis.text.x = element_text( angle = 90 ) ) +
    ylab( "beta value" ) + xlab( "CpG" )

ggsave( "Overview_b-val_ensembl_reg_feat_chosenSNPregions_all.jpg",
    height = 20, width = 14, units = "in" )