check_CpGs_biomart.R

# get the CpGs around the SNPs and fetch info from ensembl
library( tidyr )
library( plyr )
library( dplyr )

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library( ff )

## Loading required package: bit

## Attaching package bit

## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)

## creators: bit bitwhich

## coercion: as.logical as.integer as.bit as.bitwhich which

## operator: ! & | xor != ==

## querying: print length any all min max range sum summary

## bit access: length<- [ [<- [[ [[<-

## for more help type ?bit

## 
## Attaching package: 'bit'

## The following object is masked from 'package:base':
## 
##     xor

## Attaching package ff

## - getOption("fftempdir")=="/tmp/RtmprTRbeP"

## - getOption("ffextension")=="ff"

## - getOption("ffdrop")==TRUE

## - getOption("fffinonexit")==TRUE

## - getOption("ffpagesize")==65536

## - getOption("ffcaching")=="mmnoflush"  -- consider "ffeachflush" if your system stalls on large writes

## - getOption("ffbatchbytes")==16777216 -- consider a different value for tuning your system

## - getOption("ffmaxbytes")==536870912 -- consider a different value for tuning your system

## 
## Attaching package: 'ff'

## The following objects are masked from 'package:bit':
## 
##     clone, clone.default, clone.list

## The following objects are masked from 'package:utils':
## 
##     write.csv, write.csv2

## The following objects are masked from 'package:base':
## 
##     is.factor, is.ordered

my.wd <- getwd()
setwd( "~/REPOS/haplin_metylering/haplin/" )
source( "funk/source_JR.R" )
setwd( my.wd )

.onLoad( "Haplin", "Haplin" )

# 1. Reading in the SNPs
snps.RT <- read.table( "SNPs_from_RT.dat", stringsAsFactors = FALSE, header = TRUE )
snps.RT

##           SNP    locus   RR     X95CI    p.val
## 1    rs742071     PAX7 1.03  0.9-1.19 6.80e-01
## 2    rs560426    ABCA4 0.94 0.82-1.08 3.90e-01
## 3    rs642961     IRF6 0.92 0.77-1.09 3.30e-01
## 4   rs7590268    THADA 1.03 0.87-1.21 7.40e-01
## 5  rs12543318   8q21.3 0.98 0.85-1.13 8.00e-01
## 6    rs987525     8q24 0.87 0.73-1.04 1.20e-01
## 7   rs3758249    FOXE1 0.87 0.75-1.00 5.00e-02
## 8   rs7078160 KIAA1598 1.04 0.87-1.24 6.80e-01
## 9   rs8001641    SPRY2 1.08 0.94-1.24 3.00e-01
## 10  rs1873147     TPM1 0.90 0.76-1.05 1.80e-01
## 11   rs227731     NOG1 0.74 0.64-0.85 3.83e-05
## 12 rs13041247     MAFB 0.94 0.81-1.08 3.80e-01

load( "SNPs_from_RT_biomart_info.RData" )
head( snps.RT.info.found.df )

##   input.snp refsnp_id chr_name chrom_start chrom_end allele
## 1  rs742071  rs742071        1    18979874  18979874    G/T
## 2  rs742071  rs742071        1    18979874  18979874    G/T
## 3  rs742071  rs742071        1    18979874  18979874    G/T
## 4  rs560426  rs560426        1    94553438  94553438    C/T
## 5  rs560426  rs560426        1    94553438  94553438    C/T
## 6  rs642961  rs642961        1   209989270 209989270    A/G
##   ensembl_gene_stable_id ensembl_transcript_stable_id
## 1        ENSG00000009709              ENST00000375375
## 2        ENSG00000009709              ENST00000420770
## 3        ENSG00000009709              ENST00000400661
## 4        ENSG00000198691              ENST00000370225
## 5        ENSG00000198691              ENST00000535735
## 6                   <NA>                         <NA>
##   distance_to_transcript reg_feature_stable_id
## 1                  22374                  <NA>
## 2                  21859                  <NA>
## 3                  21856                  <NA>
## 4                  33250                  <NA>
## 5                  33243                  <NA>
## 6                     NA       ENSR00000958665

unique.idx <- match( snps.RT$SNP, snps.RT.info.found.df$refsnp_id )
my.SNP.map <- data.frame( Marker = snps.RT.info.found.df$refsnp_id[ unique.idx ],
                          chr = snps.RT.info.found.df$chr_name[ unique.idx ],
                          coordinate = snps.RT.info.found.df$chrom_start[ unique.idx ],
                          gene_name = snps.RT$locus )
my.SNP.map

##        Marker chr coordinate gene_name
## 1    rs742071   1   18979874      PAX7
## 2    rs560426   1   94553438     ABCA4
## 3    rs642961   1  209989270      IRF6
## 4   rs7590268   2   43540125     THADA
## 5  rs12543318   8   88868340    8q21.3
## 6    rs987525   8  129946154      8q24
## 7   rs3758249   9  100614140     FOXE1
## 8   rs7078160  10  118827560  KIAA1598
## 9   rs8001641  13   80692811     SPRY2
## 10  rs1873147  15   63312632      TPM1
## 11   rs227731  17   54773238      NOG1
## 12 rs13041247  20   39269074      MAFB

# 2. Gathering local CpGs (i.e., all CpGs per chromosome)
cpg.info <- read.csv( "../ORIG_DATA/cpg_info_all.csv" )

local.cpgs <- lapply( unique( my.SNP.map$chr ), function(x){
    data.frame( id = cpg.info[ cpg.info$CHR == x, "TargetID" ],
        coord = cpg.info[ cpg.info$CHR == x, "MAPINFO" ] )
} )
names( local.cpgs ) <- unique( my.SNP.map$chr )

# 3. gathering CpG sites around the SNPs
my.CpGs <- lapply( 1:nrow( my.SNP.map ), function(x){
    cur.row <- sapply( my.SNP.map[ x, ], as.character )
    out <- findCpGsnearSNP( snp = list( marker = cur.row[ "Marker" ],
        chr = as.numeric( cur.row[ "chr" ] ),
        coord = as.numeric( cur.row[ "coordinate" ] ) ),
        cpgs = local.cpgs[[ cur.row[ "chr" ] ]], range = 50000 )
    return( cbind( out, chr = as.numeric( cur.row[ "chr" ] ) ) )
} )
names( my.CpGs ) <- my.SNP.map$gene_name
sapply( my.CpGs, nrow )

##     PAX7    ABCA4     IRF6    THADA   8q21.3     8q24    FOXE1 KIAA1598 
##       65       21       60       15       17        3       23        1 
##    SPRY2     TPM1     NOG1     MAFB 
##        6       49       10       23

my.CpGs$`8q24`

##           id     coord chr
## 1 cg02905744 129995502   8
## 2 cg16318882 129912602   8
## 3 cg22586603 129985596   8

# 4. Fetching the information about the regulatory features
library( biomaRt )

ensembl.reg <- useEnsembl( "regulation", dataset = "hsapiens_regulatory_feature", GRCh = 37 )
all.attr <- listAttributes( ensembl.reg )
my.attrib.reg <- c( "regulatory_stable_id", "feature_type_name", "feature_type_description",
    "so_name", "so_accession", "chromosome_name", "chromosome_start", "chromosome_end" )
my.filters <- "chromosomal_region"

reg.info.found <- lapply( my.CpGs, function(x){
    cur.chr <- unique( x$chr )
    lapply( 1:nrow( x ), function(y){
        cur.row <- x[ y, ]
        out <- getBM( attributes = my.attrib.reg, filters = my.filters, values =
            paste0( cur.chr, ":", as.numeric( cur.row[ "coord" ] ),
                ":", as.numeric( cur.row[ "coord" ] ) + 1 ),
            mart = ensembl.reg )
        if( nrow( out ) == 0 ){
            return( NA )
        }
        return( out )
    } )
} )

# -- helper function: checking the ensembl stable id of a regulatory region
find.nearest.reg.feat <- function( reg.feat.table, snp.name, reg.feat.name ){
  if( !( snp.name %in% my.SNP.map$Marker ) ){
    stop( "No such SNP in the SNP.map!", call. = FALSE )
  }
  snp.pos <- my.SNP.map$coordinate[ my.SNP.map$Marker == snp.name ]
  tmp.df <- filter( reg.feat.table, grepl( pattern = reg.feat.name, x = so_name ) )
  if( nrow( tmp.df ) == 0 ){
    warning( "No such regulatory feature (", reg.feat.name, ", near SNP ", snp.name )
    return( tmp.df )
  }
  unique.ids <- unique( tmp.df$regulatory_stable_id )
  unique.crds <- distinct( dplyr::select( tmp.df, 
      regulatory_stable_id, chromosome_start, chromosome_end ) )
  # which one is the closest?
  cur.dist <- lapply( 1:nrow( unique.crds ), function(x){
    cur.row <- unique.crds[ x, ]
    cur.start <- cur.row$chromosome_start
    cur.end <- cur.row$chromosome_end
    # TODO
    if( cur.start <= snp.pos ){
      if( cur.end >= snp.pos ){ # the SNP is within the reg.feat.
        return( data.frame( reg.feat = cur.row$regulatory_stable_id,
          dist = 0 ) )
      }
      # the SNP is to the right of the reg.feat.
      return( data.frame( reg.feat = cur.row$regulatory_stable_id,
          dist = snp.pos - cur.end ) )
    }
    # the SNP is to the left of the reg.feat.
    return( data.frame( reg.feat = cur.row$regulatory_stable_id,
          dist = cur.start - snp.pos ) )
  } )
  cur.dist <- do.call( rbind, cur.dist )
  # return the table with CpGs within the reg.feat. that is the closest to the SNP
  closest.reg.feat <- cur.dist$reg.feat[ which.min( cur.dist$dist ) ]
  return( filter( reg.feat.table, regulatory_stable_id == closest.reg.feat ) )
}
# --
reg.info.found.list <- lapply( 1:length( reg.info.found ), function(x){
    cur.list <- reg.info.found[[ x ]]
    # finding where the CpG was _not_ in any regulatory region
    which.nas.cur.list <- sapply( cur.list, function(z){
        any( is.na( z ) )
    } )
    # removing those CpGs
    cur.list <- cur.list[ !which.nas.cur.list ]
    cur.cpg.list <- as.character( my.CpGs[[ x ]]$id[ !which.nas.cur.list ] )
    # creating a nice table for output: adding a CpG name column
    cur.list.out <- lapply( 1:length( cur.list ), function(y){
        cur.list[[ y ]] <- cbind(
            rep( cur.cpg.list[ y ], nrow( cur.list[[ y ]] ) ),
            cur.list[[ y ]] )
        return( cur.list[[ y ]] )
    } )
    cur.list.out <- do.call( rbind, cur.list.out )
    colnames( cur.list.out ) <- c( "CpG", my.attrib.reg )
    # retaining only the CpGs that are within the nearest region of each type
    promoter.tmp.df <- find.nearest.reg.feat( reg.feat.table = cur.list.out,
      snp.name = my.SNP.map$Marker[ x ], reg.feat.name = "^promoter$" )
    enhancer.tmp.df <- find.nearest.reg.feat( reg.feat.table = cur.list.out,
      snp.name = my.SNP.map$Marker[ x ], reg.feat.name = "^enhancer$" )
    
    if( nrow( promoter.tmp.df ) == 0 ){ #checking also for the less strict definition
      promoter.tmp.df <- find.nearest.reg.feat( reg.feat.table = cur.list.out,
        snp.name = my.SNP.map$Marker[ x ], reg.feat.name = "promoter" )
    }
    if( nrow( enhancer.tmp.df ) == 0 ){ #checking also for the less strict definition
      enhancer.tmp.df <- find.nearest.reg.feat( reg.feat.table = cur.list.out,
        snp.name = my.SNP.map$Marker[ x ], reg.feat.name = "enhancer" )
    }
    
    # retaining the original information
    add.df <- data.frame( orig.SNP = my.SNP.map$Marker[ x ],
       orig.gene = my.SNP.map$gene_name[ x ] )
    if( sum( nrow( enhancer.tmp.df ), nrow( promoter.tmp.df ) ) > 0 ){
      return( cbind( 
      rbind( promoter.tmp.df, enhancer.tmp.df ),
      add.df ) )
  }
    return( rbind( promoter.tmp.df, enhancer.tmp.df ) )
} )

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs560426

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs642961

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs642961

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs7590268

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs12543318

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs12543318

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs12543318

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs987525

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs3758249

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs3758249

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs7078160

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (promoter, near SNP
## rs7078160

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs8001641

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs1873147

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs1873147

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs227731

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs227731

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs227731

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs13041247

## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs13041247

names( reg.info.found.list ) <- my.SNP.map$gene_name
sapply( reg.info.found.list, dim )

##      PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598 SPRY2 TPM1 NOG1 MAFB
## [1,]   20     7   15     2      1    2     6        1     2   31    2   21
## [2,]   11    11   11    11     11   11    11       11    11   11   11   11

reg.info.found.df <- do.call( rbind, reg.info.found.list )

table( reg.info.found.df$so_name )

## 
##                 enhancer                 promoter promoter_flanking_region 
##                       11                       92                        7

save( my.SNP.map, my.CpGs, reg.info.found.list, reg.info.found.df,
  file = "reg_info_found_obj.RData" )

# 5. Fetching the information about the genes
ensembl <- useEnsembl( "ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37 )
my.attrib <- c( "ensembl_gene_id", "start_position", "end_position", "external_gene_name" )
my.filters <- "chromosomal_region"

gene.info.found <- lapply( my.CpGs, function(x){
    cur.chr <- unique( x$chr )
    lapply( 1:nrow( x ), function(y){
        cur.row <- x[ y, ]
        out <- getBM( attributes = my.attrib, filters = my.filters, values =
            paste0( cur.chr, ":", as.numeric( cur.row[ "coord" ] ),
                ":", as.numeric( cur.row[ "coord" ] ) + 1 ),
            mart = ensembl )
        if( nrow( out ) == 0 ){
            return( NA )
        }
        return( out )
    } )
} )

gene.info.found.list <- lapply( 1:length( gene.info.found ), function(x){
    cur.list <- gene.info.found[[ x ]]
    which.nas.cur.list <- sapply( cur.list, function(z){
        any( is.na( z ) )
    } )
    cur.list <- cur.list[ !which.nas.cur.list ]
    if( length( cur.list ) == 0 ){
      out.df <- data.frame( t( rep( NA, length( my.attrib ) + 1 ) ) )
      names( out.df ) <- c( "CpG", my.attrib )
      return( out.df )
    }
    cur.cpg.list <- as.character( my.CpGs[[ x ]]$id[ !which.nas.cur.list ] )
    cur.list.out <- lapply( 1:length( cur.list ), function(y){
        cur.list[[ y ]] <- cbind(
            rep( cur.cpg.list[ y ], nrow( cur.list[[ y ]] ) ),
            cur.list[[ y ]] )
        return( cur.list[[ y ]] )
    } )
    cur.list.out <- do.call( rbind, cur.list.out )
    colnames( cur.list.out ) <- c( "CpG", my.attrib )
    return( cur.list.out )
} )
names( gene.info.found.list ) <- my.SNP.map$gene_name
sapply( gene.info.found.list, dim )

##      PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598 SPRY2 TPM1 NOG1 MAFB
## [1,]   55    16   28    15      9    1     7        1     1   42    1    9
## [2,]    5     5    5     5      5    5     5        5     5    5    5    5

gene.info.found.df <- lapply( my.SNP.map$gene_name, function(x){
  cur.df <- gene.info.found.list[[ as.character( x ) ]]
  out <- cbind( orig_gene_name = x, cur.df )
  # filter to retain only those that are within the original gene
  return( filter( out, orig_gene_name %in% external_gene_name ) )
} )
gene.info.found.df <- do.call( rbind, gene.info.found.df )
dim( gene.info.found.df )

## [1] 173   6

gene.info.found.df <- filter( gene.info.found.df, !is.na( CpG ) )
dim( gene.info.found.df )

## [1] 173   6

head( gene.info.found.df )

##   orig_gene_name        CpG ensembl_gene_id start_position end_position
## 1           PAX7 cg00065215 ENSG00000009709       18957500     19075360
## 2           PAX7 cg00855990 ENSG00000009709       18957500     19075360
## 3           PAX7 cg01797257 ENSG00000009709       18957500     19075360
## 4           PAX7 cg02178648 ENSG00000009709       18957500     19075360
## 5           PAX7 cg02856716 ENSG00000009709       18957500     19075360
## 6           PAX7 cg04094653 ENSG00000009709       18957500     19075360
##   external_gene_name
## 1               PAX7
## 2               PAX7
## 3               PAX7
## 4               PAX7
## 5               PAX7
## 6               PAX7

save( gene.info.found.list, gene.info.found.df, file = "gene_info_found_obj.RData" )

check_CpGs_biomart.R

jro049

2019-05-14