# get the CpGs around the SNPs and fetch info from ensembl
library( tidyr )
library( plyr )
library( dplyr )
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library( ff )
## Loading required package: bit
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
##
## Attaching package: 'bit'
## The following object is masked from 'package:base':
##
## xor
## Attaching package ff
## - getOption("fftempdir")=="/tmp/RtmprTRbeP"
## - getOption("ffextension")=="ff"
## - getOption("ffdrop")==TRUE
## - getOption("fffinonexit")==TRUE
## - getOption("ffpagesize")==65536
## - getOption("ffcaching")=="mmnoflush" -- consider "ffeachflush" if your system stalls on large writes
## - getOption("ffbatchbytes")==16777216 -- consider a different value for tuning your system
## - getOption("ffmaxbytes")==536870912 -- consider a different value for tuning your system
##
## Attaching package: 'ff'
## The following objects are masked from 'package:bit':
##
## clone, clone.default, clone.list
## The following objects are masked from 'package:utils':
##
## write.csv, write.csv2
## The following objects are masked from 'package:base':
##
## is.factor, is.ordered
my.wd <- getwd()
setwd( "~/REPOS/haplin_metylering/haplin/" )
source( "funk/source_JR.R" )
setwd( my.wd )
.onLoad( "Haplin", "Haplin" )
# 1. Reading in the SNPs
snps.RT <- read.table( "SNPs_from_RT.dat", stringsAsFactors = FALSE, header = TRUE )
snps.RT
## SNP locus RR X95CI p.val
## 1 rs742071 PAX7 1.03 0.9-1.19 6.80e-01
## 2 rs560426 ABCA4 0.94 0.82-1.08 3.90e-01
## 3 rs642961 IRF6 0.92 0.77-1.09 3.30e-01
## 4 rs7590268 THADA 1.03 0.87-1.21 7.40e-01
## 5 rs12543318 8q21.3 0.98 0.85-1.13 8.00e-01
## 6 rs987525 8q24 0.87 0.73-1.04 1.20e-01
## 7 rs3758249 FOXE1 0.87 0.75-1.00 5.00e-02
## 8 rs7078160 KIAA1598 1.04 0.87-1.24 6.80e-01
## 9 rs8001641 SPRY2 1.08 0.94-1.24 3.00e-01
## 10 rs1873147 TPM1 0.90 0.76-1.05 1.80e-01
## 11 rs227731 NOG1 0.74 0.64-0.85 3.83e-05
## 12 rs13041247 MAFB 0.94 0.81-1.08 3.80e-01
load( "SNPs_from_RT_biomart_info.RData" )
head( snps.RT.info.found.df )
## input.snp refsnp_id chr_name chrom_start chrom_end allele
## 1 rs742071 rs742071 1 18979874 18979874 G/T
## 2 rs742071 rs742071 1 18979874 18979874 G/T
## 3 rs742071 rs742071 1 18979874 18979874 G/T
## 4 rs560426 rs560426 1 94553438 94553438 C/T
## 5 rs560426 rs560426 1 94553438 94553438 C/T
## 6 rs642961 rs642961 1 209989270 209989270 A/G
## ensembl_gene_stable_id ensembl_transcript_stable_id
## 1 ENSG00000009709 ENST00000375375
## 2 ENSG00000009709 ENST00000420770
## 3 ENSG00000009709 ENST00000400661
## 4 ENSG00000198691 ENST00000370225
## 5 ENSG00000198691 ENST00000535735
## 6 <NA> <NA>
## distance_to_transcript reg_feature_stable_id
## 1 22374 <NA>
## 2 21859 <NA>
## 3 21856 <NA>
## 4 33250 <NA>
## 5 33243 <NA>
## 6 NA ENSR00000958665
unique.idx <- match( snps.RT$SNP, snps.RT.info.found.df$refsnp_id )
my.SNP.map <- data.frame( Marker = snps.RT.info.found.df$refsnp_id[ unique.idx ],
chr = snps.RT.info.found.df$chr_name[ unique.idx ],
coordinate = snps.RT.info.found.df$chrom_start[ unique.idx ],
gene_name = snps.RT$locus )
my.SNP.map
## Marker chr coordinate gene_name
## 1 rs742071 1 18979874 PAX7
## 2 rs560426 1 94553438 ABCA4
## 3 rs642961 1 209989270 IRF6
## 4 rs7590268 2 43540125 THADA
## 5 rs12543318 8 88868340 8q21.3
## 6 rs987525 8 129946154 8q24
## 7 rs3758249 9 100614140 FOXE1
## 8 rs7078160 10 118827560 KIAA1598
## 9 rs8001641 13 80692811 SPRY2
## 10 rs1873147 15 63312632 TPM1
## 11 rs227731 17 54773238 NOG1
## 12 rs13041247 20 39269074 MAFB
# 2. Gathering local CpGs (i.e., all CpGs per chromosome)
cpg.info <- read.csv( "../ORIG_DATA/cpg_info_all.csv" )
local.cpgs <- lapply( unique( my.SNP.map$chr ), function(x){
data.frame( id = cpg.info[ cpg.info$CHR == x, "TargetID" ],
coord = cpg.info[ cpg.info$CHR == x, "MAPINFO" ] )
} )
names( local.cpgs ) <- unique( my.SNP.map$chr )
# 3. gathering CpG sites around the SNPs
my.CpGs <- lapply( 1:nrow( my.SNP.map ), function(x){
cur.row <- sapply( my.SNP.map[ x, ], as.character )
out <- findCpGsnearSNP( snp = list( marker = cur.row[ "Marker" ],
chr = as.numeric( cur.row[ "chr" ] ),
coord = as.numeric( cur.row[ "coordinate" ] ) ),
cpgs = local.cpgs[[ cur.row[ "chr" ] ]], range = 50000 )
return( cbind( out, chr = as.numeric( cur.row[ "chr" ] ) ) )
} )
names( my.CpGs ) <- my.SNP.map$gene_name
sapply( my.CpGs, nrow )
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598
## 65 21 60 15 17 3 23 1
## SPRY2 TPM1 NOG1 MAFB
## 6 49 10 23
my.CpGs$`8q24`
## id coord chr
## 1 cg02905744 129995502 8
## 2 cg16318882 129912602 8
## 3 cg22586603 129985596 8
# 4. Fetching the information about the regulatory features
library( biomaRt )
ensembl.reg <- useEnsembl( "regulation", dataset = "hsapiens_regulatory_feature", GRCh = 37 )
all.attr <- listAttributes( ensembl.reg )
my.attrib.reg <- c( "regulatory_stable_id", "feature_type_name", "feature_type_description",
"so_name", "so_accession", "chromosome_name", "chromosome_start", "chromosome_end" )
my.filters <- "chromosomal_region"
reg.info.found <- lapply( my.CpGs, function(x){
cur.chr <- unique( x$chr )
lapply( 1:nrow( x ), function(y){
cur.row <- x[ y, ]
out <- getBM( attributes = my.attrib.reg, filters = my.filters, values =
paste0( cur.chr, ":", as.numeric( cur.row[ "coord" ] ),
":", as.numeric( cur.row[ "coord" ] ) + 1 ),
mart = ensembl.reg )
if( nrow( out ) == 0 ){
return( NA )
}
return( out )
} )
} )
# -- helper function: checking the ensembl stable id of a regulatory region
find.nearest.reg.feat <- function( reg.feat.table, snp.name, reg.feat.name ){
if( !( snp.name %in% my.SNP.map$Marker ) ){
stop( "No such SNP in the SNP.map!", call. = FALSE )
}
snp.pos <- my.SNP.map$coordinate[ my.SNP.map$Marker == snp.name ]
tmp.df <- filter( reg.feat.table, grepl( pattern = reg.feat.name, x = so_name ) )
if( nrow( tmp.df ) == 0 ){
warning( "No such regulatory feature (", reg.feat.name, ", near SNP ", snp.name )
return( tmp.df )
}
unique.ids <- unique( tmp.df$regulatory_stable_id )
unique.crds <- distinct( dplyr::select( tmp.df,
regulatory_stable_id, chromosome_start, chromosome_end ) )
# which one is the closest?
cur.dist <- lapply( 1:nrow( unique.crds ), function(x){
cur.row <- unique.crds[ x, ]
cur.start <- cur.row$chromosome_start
cur.end <- cur.row$chromosome_end
# TODO
if( cur.start <= snp.pos ){
if( cur.end >= snp.pos ){ # the SNP is within the reg.feat.
return( data.frame( reg.feat = cur.row$regulatory_stable_id,
dist = 0 ) )
}
# the SNP is to the right of the reg.feat.
return( data.frame( reg.feat = cur.row$regulatory_stable_id,
dist = snp.pos - cur.end ) )
}
# the SNP is to the left of the reg.feat.
return( data.frame( reg.feat = cur.row$regulatory_stable_id,
dist = cur.start - snp.pos ) )
} )
cur.dist <- do.call( rbind, cur.dist )
# return the table with CpGs within the reg.feat. that is the closest to the SNP
closest.reg.feat <- cur.dist$reg.feat[ which.min( cur.dist$dist ) ]
return( filter( reg.feat.table, regulatory_stable_id == closest.reg.feat ) )
}
# --
reg.info.found.list <- lapply( 1:length( reg.info.found ), function(x){
cur.list <- reg.info.found[[ x ]]
# finding where the CpG was _not_ in any regulatory region
which.nas.cur.list <- sapply( cur.list, function(z){
any( is.na( z ) )
} )
# removing those CpGs
cur.list <- cur.list[ !which.nas.cur.list ]
cur.cpg.list <- as.character( my.CpGs[[ x ]]$id[ !which.nas.cur.list ] )
# creating a nice table for output: adding a CpG name column
cur.list.out <- lapply( 1:length( cur.list ), function(y){
cur.list[[ y ]] <- cbind(
rep( cur.cpg.list[ y ], nrow( cur.list[[ y ]] ) ),
cur.list[[ y ]] )
return( cur.list[[ y ]] )
} )
cur.list.out <- do.call( rbind, cur.list.out )
colnames( cur.list.out ) <- c( "CpG", my.attrib.reg )
# retaining only the CpGs that are within the nearest region of each type
promoter.tmp.df <- find.nearest.reg.feat( reg.feat.table = cur.list.out,
snp.name = my.SNP.map$Marker[ x ], reg.feat.name = "^promoter$" )
enhancer.tmp.df <- find.nearest.reg.feat( reg.feat.table = cur.list.out,
snp.name = my.SNP.map$Marker[ x ], reg.feat.name = "^enhancer$" )
if( nrow( promoter.tmp.df ) == 0 ){ #checking also for the less strict definition
promoter.tmp.df <- find.nearest.reg.feat( reg.feat.table = cur.list.out,
snp.name = my.SNP.map$Marker[ x ], reg.feat.name = "promoter" )
}
if( nrow( enhancer.tmp.df ) == 0 ){ #checking also for the less strict definition
enhancer.tmp.df <- find.nearest.reg.feat( reg.feat.table = cur.list.out,
snp.name = my.SNP.map$Marker[ x ], reg.feat.name = "enhancer" )
}
# retaining the original information
add.df <- data.frame( orig.SNP = my.SNP.map$Marker[ x ],
orig.gene = my.SNP.map$gene_name[ x ] )
if( sum( nrow( enhancer.tmp.df ), nrow( promoter.tmp.df ) ) > 0 ){
return( cbind(
rbind( promoter.tmp.df, enhancer.tmp.df ),
add.df ) )
}
return( rbind( promoter.tmp.df, enhancer.tmp.df ) )
} )
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs560426
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs642961
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs642961
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs7590268
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs12543318
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs12543318
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs12543318
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs987525
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs3758249
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs3758249
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs7078160
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (promoter, near SNP
## rs7078160
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs8001641
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs1873147
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs1873147
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^promoter$, near SNP
## rs227731
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs227731
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs227731
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (^enhancer$, near SNP
## rs13041247
## Warning in find.nearest.reg.feat(reg.feat.table = cur.list.out, snp.name
## = my.SNP.map$Marker[x], : No such regulatory feature (enhancer, near SNP
## rs13041247
names( reg.info.found.list ) <- my.SNP.map$gene_name
sapply( reg.info.found.list, dim )
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598 SPRY2 TPM1 NOG1 MAFB
## [1,] 20 7 15 2 1 2 6 1 2 31 2 21
## [2,] 11 11 11 11 11 11 11 11 11 11 11 11
reg.info.found.df <- do.call( rbind, reg.info.found.list )
table( reg.info.found.df$so_name )
##
## enhancer promoter promoter_flanking_region
## 11 92 7
save( my.SNP.map, my.CpGs, reg.info.found.list, reg.info.found.df,
file = "reg_info_found_obj.RData" )
# 5. Fetching the information about the genes
ensembl <- useEnsembl( "ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37 )
my.attrib <- c( "ensembl_gene_id", "start_position", "end_position", "external_gene_name" )
my.filters <- "chromosomal_region"
gene.info.found <- lapply( my.CpGs, function(x){
cur.chr <- unique( x$chr )
lapply( 1:nrow( x ), function(y){
cur.row <- x[ y, ]
out <- getBM( attributes = my.attrib, filters = my.filters, values =
paste0( cur.chr, ":", as.numeric( cur.row[ "coord" ] ),
":", as.numeric( cur.row[ "coord" ] ) + 1 ),
mart = ensembl )
if( nrow( out ) == 0 ){
return( NA )
}
return( out )
} )
} )
gene.info.found.list <- lapply( 1:length( gene.info.found ), function(x){
cur.list <- gene.info.found[[ x ]]
which.nas.cur.list <- sapply( cur.list, function(z){
any( is.na( z ) )
} )
cur.list <- cur.list[ !which.nas.cur.list ]
if( length( cur.list ) == 0 ){
out.df <- data.frame( t( rep( NA, length( my.attrib ) + 1 ) ) )
names( out.df ) <- c( "CpG", my.attrib )
return( out.df )
}
cur.cpg.list <- as.character( my.CpGs[[ x ]]$id[ !which.nas.cur.list ] )
cur.list.out <- lapply( 1:length( cur.list ), function(y){
cur.list[[ y ]] <- cbind(
rep( cur.cpg.list[ y ], nrow( cur.list[[ y ]] ) ),
cur.list[[ y ]] )
return( cur.list[[ y ]] )
} )
cur.list.out <- do.call( rbind, cur.list.out )
colnames( cur.list.out ) <- c( "CpG", my.attrib )
return( cur.list.out )
} )
names( gene.info.found.list ) <- my.SNP.map$gene_name
sapply( gene.info.found.list, dim )
## PAX7 ABCA4 IRF6 THADA 8q21.3 8q24 FOXE1 KIAA1598 SPRY2 TPM1 NOG1 MAFB
## [1,] 55 16 28 15 9 1 7 1 1 42 1 9
## [2,] 5 5 5 5 5 5 5 5 5 5 5 5
gene.info.found.df <- lapply( my.SNP.map$gene_name, function(x){
cur.df <- gene.info.found.list[[ as.character( x ) ]]
out <- cbind( orig_gene_name = x, cur.df )
# filter to retain only those that are within the original gene
return( filter( out, orig_gene_name %in% external_gene_name ) )
} )
gene.info.found.df <- do.call( rbind, gene.info.found.df )
dim( gene.info.found.df )
## [1] 173 6
gene.info.found.df <- filter( gene.info.found.df, !is.na( CpG ) )
dim( gene.info.found.df )
## [1] 173 6
head( gene.info.found.df )
## orig_gene_name CpG ensembl_gene_id start_position end_position
## 1 PAX7 cg00065215 ENSG00000009709 18957500 19075360
## 2 PAX7 cg00855990 ENSG00000009709 18957500 19075360
## 3 PAX7 cg01797257 ENSG00000009709 18957500 19075360
## 4 PAX7 cg02178648 ENSG00000009709 18957500 19075360
## 5 PAX7 cg02856716 ENSG00000009709 18957500 19075360
## 6 PAX7 cg04094653 ENSG00000009709 18957500 19075360
## external_gene_name
## 1 PAX7
## 2 PAX7
## 3 PAX7
## 4 PAX7
## 5 PAX7
## 6 PAX7
save( gene.info.found.list, gene.info.found.df, file = "gene_info_found_obj.RData" )