Additionally, see CLIPflexR documentation for:
fetchSequencesForCLIP
annotatePeaksWithPatterns
fetchSequencesForClIP <- function(peaks,reSize=NULL,fasta,add5=0,add3=0){
swd <- Rsamtools::FaFile(fasta)
fastaLens <- swd %>% seqlengths
start(peaks) <- start(peaks)-add5
end(peaks) <- end(peaks)+add3
sees <- seqlevels(peaks)
seqlengths(peaks) <- fastaLens[match(sees,names(fastaLens),incomparables = 0)]
Boundaries <- GRanges(seqlevels(peaks),IRanges(1,seqlengths(peaks)))
if(!is.null(reSize)){
resizePeaks <- resize(peaks, reSize, fix="center", use.names=TRUE)
}else{
resizePeaks <- peaks
}
names(resizePeaks) <- paste0(seqnames(resizePeaks),":",start(resizePeaks),"-",end(resizePeaks))
resizePeaks <- trim(resizePeaks)
temp <- findOverlaps(resizePeaks,Boundaries,type=c("within"))
temp2 <- Rsamtools::getSeq(Rsamtools:::FaFile(fasta),resizePeaks[temp@from])
names(temp2) <- names(resizePeaks[temp@from])
temp2
}
annotatePeaksWithPatterns <- function(peaks,fasta,patterns,resize=64,add5=0,add3=0,verbose=FALSE,checkReverse=TRUE){
require(GenomicRanges)
require(magrittr)
require(Biostrings)
require(Rsamtools)
if(verbose) message("Reading in peaks....",appendLF = FALSE)
if(is(peaks,"GRanges")){
peaks <- peaks
peaks$originalPeak <- paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), "_", strand(peaks))
}else if(is(peaks,"character")){
if(!file.exists(peaks))stop(paste0("The file ",peaks," does not exist"))
peaks <- read.table(peaks, header = TRUE)
peaks <- makeGRangesFromDataFrame(peaks,
keep.extra.columns=TRUE,
ignore.strand=FALSE,
seqinfo=NULL,
seqnames.field="seqnames",
start.field="start",
end.field="end",
strand.field="strand",
starts.in.df.are.0based=FALSE)
peaks$originalPeak <- paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), ":", strand(peaks))
}
if(verbose) message("...done")
if(verbose) message("Read in ",length(peaks)," peaks")
if(!file.exists(fasta))stop(paste0("The file ",fasta," does not exist"))
if(verbose) message("Indexing FASTA...",appendLF = FALSE)
indexFa(fasta)
if(verbose) message("...done")
if(verbose) message("Aligning seqlevels across peaks and FASTA...",appendLF = FALSE)
fastaLens <- FaFile(fasta) %>% seqlengths
sees <- seqlevels(peaks)
seqlengths(peaks) <- fastaLens[match(sees,names(fastaLens),incomparables = 0)]
if(verbose) message("...done")
if(verbose) message("Removing invalid peaks which are outside contig boundaries...",appendLF = FALSE)
Boundaries <- GRanges(seqlevels(peaks),IRanges(1,seqlengths(peaks)))
resizePeaks <- resize(peaks, resize, fix="center", use.names=TRUE)
resizePeaks <- trim(resizePeaks)
names(resizePeaks) <- paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), "_", strand(peaks))
temp <- findOverlaps(resizePeaks,Boundaries,type=c("within"))
validPeaks <- resizePeaks[temp@from]
validPeaks$extendedPeak <- paste0(seqnames(validPeaks),":",start(validPeaks),"_",end(validPeaks), "_", strand(validPeaks))
if(verbose) message("...done")
if(verbose) message("Removed in ",length(validPeaks)-length(resizePeaks)," peaks")
if(verbose) message("Retrieving sequence from under peaks....",appendLF = FALSE)
myRes <- getSeq(Rsamtools:::FaFile(fasta),validPeaks)
names(myRes) <- validPeaks$extendedPeak
validPeaks$seqUnderExtendedPeak <- myRes
if(verbose) message("...done")
if(verbose) message("Retrieving patterns to search for....",appendLF = FALSE)
if(length(patterns) > 1){
pattern=patterns
}else{
motifSeq <- readDNAStringSet(patterns)
pattern <- as.character(motifSeq)
}
if(verbose) message("...done")
if(verbose) message("Read in ",length(pattern)," patterns")
if(verbose) message("Searching for ",length(pattern)," patterns in peaks....")
mergePeaksAndSites <- patternCallList <- list()
for(i in 1:length(pattern)){
if(verbose) message("Search for ",pattern[i])
peaks_Sites <- validPeaks
fixedInRegion <- vmatchPattern(pattern=pattern[i],subject=myRes) %>% unlist()
startOfPmatch <- resize(fixedInRegion,width=1,fix="start") %>% unname %>% as.character
pos <- matrix(unlist(strsplit(gsub(".*:","",names(fixedInRegion)),"_")),ncol=3,byrow =T)
seq <- gsub(":.*","",names(fixedInRegion))
sitesGRpos <- GRanges(seq[pos[,3] =="+"],
IRanges(as.numeric(pos[pos[,3] =="+",1])+start(fixedInRegion[pos[,3] =="+"])-1,
as.numeric(pos[pos[,3] =="+",1])+end(fixedInRegion[pos[,3] =="+"])-1),startOfPmatch=startOfPmatch[pos[,3] =="+"])
sitesGRpos$PeakID <- names(fixedInRegion[pos[,3] =="+"])
sitesGRneg <- GRanges(seq[pos[,3] =="-"],
IRanges(as.numeric(pos[pos[,3] =="-",2])-end(fixedInRegion[pos[,3] =="-"])+1,
as.numeric(pos[pos[,3] =="-",2])-start(fixedInRegion[pos[,3] =="-"])+1),startOfPmatch=startOfPmatch[pos[,3] =="-"])
sitesGRneg$PeakID <- names(fixedInRegion[pos[,3] =="-"])
sitesFA <- c(fetchSequencesForClIP(sitesGRpos,reSize=NULL,fasta),reverseComplement(fetchSequencesForClIP(sitesGRneg,reSize=NULL,fasta)))
sitesFAExtended <- c(fetchSequencesForClIP(sitesGRpos,reSize=NULL,fasta,add5=add5,add3=add3),
reverseComplement(fetchSequencesForClIP(sitesGRneg,reSize=NULL,fasta,add5=add5,add3=add3))
)
sitesGR <- suppressWarnings(c(sitesGRpos,sitesGRneg))
sitesGR$Seq <- sitesFA
sitesGR$SeqExtended <- sitesFAExtended
sitesGR$siteOfPattern <- granges(sitesGR) %>% unname %>% as.character
sitesGR$centerOfPattern <- resize(sitesGR,width=1,fix="center") %>% unname %>% as.character
dfSitesGR <- as.data.frame(sitesGR)
peaks_SitesDF <- as.data.frame(peaks_Sites)
mergePeaksAndSites[[i]] <- merge(peaks_SitesDF,dfSitesGR,by.x="extendedPeak",by.y="PeakID",all=FALSE)
patternCallList[[i]] <- mergePeaksAndSites[[i]] %>% dplyr::select(Seq,SeqExtended,startOfPmatch,centerOfPattern,siteOfPattern,originalPeak,extendedPeak)
rm(peaks_SitesDF)
rm(peaks_Sites)
}
if(verbose) message("....finished search")
names(mergePeaksAndSites) <- names(patternCallList) <- as.character(pattern)
return(list(mergePeaksAndSites,patternCallList))
}
read in all peaks from matrix building and PCA filtering output; input file “Final_matrix_all_peaks_add_filtering.txt” is peak matrix with all sample and filtering information, generated in PCA_annotation_fgsea_targetvenns script, and available in my Github
#read in all peaks, with filtering stats/etc added from PCA output, available on my github
all <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_matrix_all_peaks_add_filtering.txt", sep = "\t", header = TRUE)
#make GR for seed search
allGR <- makeGRangesFromDataFrame(all,
keep.extra.columns=TRUE,
ignore.strand=FALSE,
seqinfo=NULL,
seqnames.field="seqnames",
start.field="start",
end.field="end",
strand.field="strand",
starts.in.df.are.0based=FALSE)
allctrlGR <- GenomicRanges::shift(allGR, 4000) #note, annotations are no longer valid
“top10_aegypti_Ago1_known_fam_rn.fa” file with 6mer target sequences of the most abundant miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github
“Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa” is available for download at Vectorbase
Additionally, see CLIPflexR documentation for:
annotatePeaksWithPatterns
#all peak ranges
res <- annotatePeaksWithPatterns(peaks=allGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/top10_aegypti_Ago1_known_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Loading required package: Rsamtools
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
## sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
## NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
## NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
## NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
## NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
## NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
## NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
## NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
## NIGP01002255, and NIGP01002260. Note that ranges located on a
## sequence whose length is unknown (NA) or on a circular sequence
## are not considered out-of-bound (use seqlengths() and isCircular()
## to get the lengths and circularity flags of the underlying
## sequences). You can use trim() to trim these ranges. See
## ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 10 patterns
## Searching for 10 patterns in peaks....
## Search for ACTGCC
## Search for AGTATT
## Search for ATACTC
## Search for CATTCC
## Search for CCGTCC
## Search for CTCTCT
## Search for GATCTC
## Search for GCATTT
## Search for GTGTTC
## Search for TGTGAT
## ....finished search
res <- res[[1]]
res <- rbindlist(res)
#make sure are only counting pattern matches 1X
#could have dups if extended sequence of one peak overlaps with the extended peak of another sequence and then a match is in both;
#if site of pattern and extended seq around pattern match is the same, randomly remove duplicates
res$group <- paste0(res$SeqExtended, "_", res$siteOfPattern)
res$startOfPmatch <- as.numeric(res$startOfPmatch)
res$filt <- duplicated(res$group)
res_Ago1_known <- subset(res, res$filt == FALSE)
#control ranges
res_Ago1_known_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/top10_aegypti_Ago1_known_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
## NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
## NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
## NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
## NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
## NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
## NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
## NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
## NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
## NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
## NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
## NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
## NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
## NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
## NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
## NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
## NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
## NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
## NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
## NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
## NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
## NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
## NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
## NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
## NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
## NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
## NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
## NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
## NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
## NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
## NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
## NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
## NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
## NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
## NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
## NIGP01002295. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
## NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
## NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
## NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
## NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
## NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
## NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
## NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
## NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
## NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
## NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
## NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
## NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
## NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
## NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
## NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
## NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
## NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
## NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
## NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
## NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
## NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
## NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
## NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
## NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
## NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
## NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
## NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
## NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
## NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
## NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
## NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
## NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
## NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
## NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
## NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
## NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
## located on a sequence whose length is unknown (NA) or on a
## circular sequence are not considered out-of-bound (use
## seqlengths() and isCircular() to get the lengths and circularity
## flags of the underlying sequences). You can use trim() to trim
## these ranges. See ?`trim,GenomicRanges-method` for more
## information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 10 patterns
## Searching for 10 patterns in peaks....
## Search for ACTGCC
## Search for AGTATT
## Search for ATACTC
## Search for CATTCC
## Search for CCGTCC
## Search for CTCTCT
## Search for GATCTC
## Search for GCATTT
## Search for GTGTTC
## Search for TGTGAT
## ....finished search
res_Ago1_known_ctrl <- res_Ago1_known_ctrl[[1]]
res_Ago1_known_ctrl <- rbindlist(res_Ago1_known_ctrl)
res_Ago1_known_ctrl$group <- paste0(res_Ago1_known_ctrl$SeqExtended, "_", res_Ago1_known_ctrl$siteOfPattern)
length(unique(res_Ago1_known_ctrl$group)) #62644
## [1] 61733
res_Ago1_known_ctrl$startOfPmatch <- as.numeric(res_Ago1_known_ctrl$startOfPmatch)
res_Ago1_known_ctrl$filt <- duplicated(res_Ago1_known_ctrl$group)
res_Ago1_known_ctrl <- subset(res_Ago1_known_ctrl, res_Ago1_known_ctrl$filt == FALSE)
#get GRs with the same peak subset for calculating specific coverage
Ago1_aegypti_ranges <- subset(res_Ago1_known, res_Ago1_known$aegypti_Ago1_BCsample > 0 & res_Ago1_known$aegypti_Ago2_BCsample ==0 )
Ago1_aegypti_ctrl_ranges <- subset(res_Ago1_known_ctrl, res_Ago1_known_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_known_ctrl$aegypti_Ago2_BCsample == 0)
#subset matched patterns in peaks with support from aegypti Ago1 reads
Ago1_aegypti_seed_known_ctrl <- subset(res_Ago1_known_ctrl$startOfPmatch, res_Ago1_known_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_known_ctrl$aegypti_Ago2_BCsample == 0)
Ago1_aegypti_seed_known <- subset(res_Ago1_known$startOfPmatch, res_Ago1_known$aegypti_Ago1_BCsample > 0 & res_Ago1_known$aegypti_Ago2_BCsample ==0 )
“top10_aegypti_Ago1_known_fam_rn.fa” file with 6mer target sequences of the most abundant miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github
#all peak ranges
res <- annotatePeaksWithPatterns(peaks=allGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/top10_cells_Ago1_known_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
## sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
## NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
## NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
## NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
## NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
## NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
## NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
## NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
## NIGP01002255, and NIGP01002260. Note that ranges located on a
## sequence whose length is unknown (NA) or on a circular sequence
## are not considered out-of-bound (use seqlengths() and isCircular()
## to get the lengths and circularity flags of the underlying
## sequences). You can use trim() to trim these ranges. See
## ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 10 patterns
## Searching for 10 patterns in peaks....
## Search for ATACTC
## Search for CCGTCC
## Search for CTAGTC
## Search for GAGATT
## Search for GATATT
## Search for GATCTC
## Search for GTACAA
## Search for GTGTTC
## Search for TACCTG
## Search for TGTGAT
## ....finished search
res <- res[[1]]
res <- rbindlist(res)
res$group <- paste0(res$SeqExtended, "_", res$siteOfPattern)
res$startOfPmatch <- as.numeric(res$startOfPmatch)
res$filt <- duplicated(res$group)
res_Ago1_known <- subset(res, res$filt == FALSE)
#control ranges
res_Ago1_known_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/top10_cells_Ago1_known_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
## NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
## NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
## NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
## NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
## NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
## NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
## NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
## NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
## NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
## NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
## NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
## NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
## NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
## NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
## NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
## NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
## NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
## NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
## NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
## NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
## NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
## NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
## NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
## NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
## NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
## NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
## NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
## NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
## NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
## NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
## NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
## NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
## NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
## NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
## NIGP01002295. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
## NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
## NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
## NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
## NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
## NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
## NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
## NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
## NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
## NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
## NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
## NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
## NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
## NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
## NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
## NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
## NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
## NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
## NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
## NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
## NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
## NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
## NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
## NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
## NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
## NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
## NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
## NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
## NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
## NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
## NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
## NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
## NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
## NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
## NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
## NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
## NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
## located on a sequence whose length is unknown (NA) or on a
## circular sequence are not considered out-of-bound (use
## seqlengths() and isCircular() to get the lengths and circularity
## flags of the underlying sequences). You can use trim() to trim
## these ranges. See ?`trim,GenomicRanges-method` for more
## information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 10 patterns
## Searching for 10 patterns in peaks....
## Search for ATACTC
## Search for CCGTCC
## Search for CTAGTC
## Search for GAGATT
## Search for GATATT
## Search for GATCTC
## Search for GTACAA
## Search for GTGTTC
## Search for TACCTG
## Search for TGTGAT
## ....finished search
res_Ago1_known_ctrl <- res_Ago1_known_ctrl[[1]]
res_Ago1_known_ctrl <- rbindlist(res_Ago1_known_ctrl)
res_Ago1_known_ctrl$group <- paste0(res_Ago1_known_ctrl$SeqExtended, "_", res_Ago1_known_ctrl$siteOfPattern)
res_Ago1_known_ctrl$startOfPmatch <- as.numeric(res_Ago1_known_ctrl$startOfPmatch)
res_Ago1_known_ctrl$filt <- duplicated(res_Ago1_known_ctrl$group)
res_Ago1_known_ctrl <- subset(res_Ago1_known_ctrl, res_Ago1_known_ctrl$filt == FALSE)
#get GRs with the same peak subset for calculating specific coverage
Ago1_Aag2_ranges <- subset(res_Ago1_known, res_Ago1_known$Aag2_Ago1_BCsample > 0 & res_Ago1_known$Aag2_Ago2_BCsample ==0 )
Ago1_Aag2_ctrl_ranges <- subset(res_Ago1_known_ctrl, res_Ago1_known_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_known_ctrl$Aag2_Ago2_BCsample ==0 )
#subset matched patterns in peaks with support from Aag2 Ago1 reads
Ago1_Aag2_seed_known_ctrl <- subset(res_Ago1_known_ctrl$startOfPmatch, res_Ago1_known_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_known_ctrl$Aag2_Ago2_BCsample ==0)
Ago1_Aag2_seed_known <- subset(res_Ago1_known$startOfPmatch, res_Ago1_known$Aag2_Ago1_BCsample > 0 & res_Ago1_known$Aag2_Ago2_BCsample ==0)
# #aegypti Ago1 known peak ranges
# Ago1_aegyptik_GR <- GRanges(seqnames = Ago1_aegypti_ranges$seqnames.x, strand = Ago1_aegypti_ranges$strand.x,
# ranges = IRanges(start =Ago1_aegypti_ranges$start.x, width =Ago1_aegypti_ranges$width.x ))
#
# ToPlot_aegypti_Ago1kalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegyptik_GR ,format = "bigwig")
# f <- plotRegion(ToPlot_aegypti_Ago1kalt,outliers = 0.05,groupBy = "Sample")
# aegypti_Ago1kalt <- f$data
#
# #aegypti Ago1 known control ranges
# Ago1_aegyptik_ctrlGR <- GRanges(seqnames = Ago1_aegypti_ctrl_ranges$seqnames.x, strand = Ago1_aegypti_ctrl_ranges$strand.x,
# ranges = IRanges(start =Ago1_aegypti_ctrl_ranges$start.x, width =Ago1_aegypti_ctrl_ranges$width.x ))
#
# ToPlot_ctrl_aegypti_kAgo1 <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegyptik_ctrlGR,format = "bigwig")
# ctrl_aegypti_kAgo1_peaks <- plotRegion(ToPlot_ctrl_aegypti_kAgo1,outliers = 0.05)
# ctrl_aegypti_kAgo1_peaks <- ctrl_aegypti_kAgo1_peaks$data
# ctrl_aegypti_kAgo1_peaks$Sample <- c("ctrl_peaks")
#
# aegypti_Ago1kalt <- rbind(ctrl_aegypti_kAgo1_peaks, aegypti_Ago1kalt )
#
# #Aag2 Ago1 known peak ranges
# Ago1_Aag2k_GR <- GRanges(seqnames = Ago1_Aag2_ranges$seqnames.x, strand = Ago1_Aag2_ranges$strand.x,
# ranges = IRanges(start = Ago1_Aag2_ranges$start.x, width =Ago1_Aag2_ranges$width.x ))
#
# ToPlot_Aag2_Ago1kalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2k_GR ,format = "bigwig")
# f <- plotRegion(ToPlot_Aag2_Ago1kalt,outliers = 0.05,groupBy = "Sample")
# Aag2_Ago1kalt <- f$data
#
# #Aag2 Ago1 known control ranges
# Ago1_Aag2k_ctrlGR <- GRanges(seqnames = Ago1_Aag2_ctrl_ranges$seqnames.x, strand = Ago1_Aag2_ctrl_ranges$strand.x,
# ranges = IRanges(start = Ago1_Aag2_ctrl_ranges$start.x, width =Ago1_Aag2_ctrl_ranges$width.x ))
#
# ToPlot_ctrl_Aag2_Ago1kalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2k_ctrlGR ,format = "bigwig")
# ctrlAag2_Ago1kalt_peaks <- plotRegion(ToPlot_ctrl_Aag2_Ago1kalt ,outliers = 0.05)
# ctrlAag2_Ago1kalt_peaks <- ctrlAag2_Ago1kalt_peaks$data
# ctrlAag2_Ago1kalt_peaks$Sample <- c("ctrl_peaks")
#
# Aag2_Ago1kalt <- rbind(ctrlAag2_Ago1kalt_peaks, Aag2_Ago1kalt)
# #save(Aag2_Ago1kalt, aegypti_Ago1kalt, file="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/knownmiRNAs.RData")
output file “knownmiRNAs.RData” is available on my Github
load("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/knownmiRNAs.RData")
##now combine coverage with pattern matches by position and graph
nf <- c("Aag2_Ago1kalt", "aegypti_Ago1kalt")
norm_f <- do.call("list",mget(nf))
cov <- lapply(norm_f, function(x) subset(x, x$xIndex >= 1400 & x$xIndex <=1600))
covdf <- rbindlist(cov, idcol = TRUE)
covdf$xIndex <- covdf$xIndex - 1500
col.names <- colnames(covdf)
col.names <- gsub(".id", "ranges", col.names)
col.names <- gsub("xIndex", "position", col.names)
covdf <- setNames(covdf, col.names)
cells_ago1 <- subset(covdf, covdf$ranges=="Aag2_Ago1kalt")
cells_ago1 <- subset(cells_ago1, cells_ago1$Sample=="ctrl_peaks" | cells_ago1$Sample=="cat_Aag2Ago1.bw")
aegypti_ago1 <- subset(covdf, covdf$ranges=="aegypti_Ago1kalt")
aegypti_ago1 <- subset(aegypti_ago1, aegypti_ago1$Sample=="ctrl_peaks" | aegypti_ago1$Sample=="cat_aegyptiAgo1.bw")
aegypti_ago1$merge <- ifelse(aegypti_ago1$Sample=="cat_aegyptiAgo1.bw" , paste0(aegypti_ago1$position, "Ago1_aegypti_seed_known"), paste0(aegypti_ago1$position, "Ago1_aegypti_seed_known_ctrl"))
cells_ago1$merge <- ifelse(cells_ago1$Sample=="cat_Aag2Ago1.bw" , paste0(cells_ago1$position, "Ago1_Aag2_seed_known"), paste0(cells_ago1$position, "Ago1_Aag2_seed_known_ctrl"))
#get lists of pattern seed matches and then count all together for graphing
Ago1known <- grep("_seed_known", names(.GlobalEnv),value=TRUE)
Ago1known <- grep("Ago1_", Ago1known, value=TRUE)
Ago1known <- do.call("list",mget(Ago1known))
Ago1known <- lapply(Ago1known, as.data.frame)
Ago1known <- lapply(Ago1known, setNames, nm="position")
Ago1known_counts <- lapply(Ago1known, function(x) x %>% group_by(position) %>% summarize(count=n()))
Ago1known_counts<- lapply(Ago1known_counts, function(x) subset(x, x$position <= 300 & x$position >=100))
#make master df by sample name
Ago1df <- rbindlist(Ago1known_counts, idcol = TRUE)
Ago1df$position <- Ago1df$position - 200
Aag2Ago1k <- subset(Ago1df, Ago1df$.id=="Ago1_Aag2_seed_known_ctrl" | Ago1df$.id=="Ago1_Aag2_seed_known")
Aag2Ago1k$merge <- paste0(Aag2Ago1k$position, Aag2Ago1k$.id)
aegyptiAgo1k <- subset(Ago1df, Ago1df$.id=="Ago1_aegypti_seed_known_ctrl" | Ago1df$.id=="Ago1_aegypti_seed_known")
aegyptiAgo1k$merge <- paste0(aegyptiAgo1k$position, aegyptiAgo1k$.id)
#merge coverage with pattern match counts by position
aegypti_ago1 <- merge(aegyptiAgo1k, aegypti_ago1, by="merge")
cells_ago1 <- merge(Aag2Ago1k, cells_ago1, by="merge")
aegypti_ago1 <- aegypti_ago1 %>%
group_by(Sample) %>%
mutate( counts_norm = (count*Score))
aegypti_ago1 <- aegypti_ago1 %>%
group_by(Sample) %>%
mutate(freq_norm = counts_norm/(sum(counts_norm)))
cells_ago1 <- cells_ago1 %>%
group_by(Sample) %>%
mutate( counts_norm = (count*Score))
cells_ago1 <-cells_ago1 %>%
group_by(Sample) %>%
mutate(freq_norm = counts_norm/(sum(counts_norm)))
#smooth with spline function
aegypti_ago1k <-split(aegypti_ago1, aegypti_ago1$.id)
aegypti_ago1k <- lapply(aegypti_ago1k, function(x) x[order(x$position.x),])
aegypti_ago1k_spline <-lapply(aegypti_ago1k, function(x) spline(x$freq_norm))
aegypti_ago1k_spline <- rbindlist(aegypti_ago1k_spline, idcol = TRUE)
aegypti_ago1k_spline$x <- aegypti_ago1k_spline$x - 101
col.names <- colnames(aegypti_ago1k_spline)
col.names <- gsub(".id", "sample", col.names)
aegypti_ago1k_spline <- setNames(aegypti_ago1k_spline, col.names)
ggplot(aegypti_ago1k_spline , aes(x = x, y = y)) +
geom_line(aes(color = sample)) +
scale_color_manual(values = c("#3360A9", "#999999")) + theme_bw()
cells_ago1k <-split(cells_ago1, cells_ago1$.id)
cells_ago1k <- lapply(cells_ago1k, function(x) x[order(x$position.x),])
cells_ago1k_spline <-lapply(cells_ago1k, function(x) spline(x$freq_norm))
cells_ago1k_spline <- rbindlist(cells_ago1k_spline, idcol = TRUE)
cells_ago1k_spline$x <- cells_ago1k_spline $x - 101
col.names <- colnames(cells_ago1k_spline)
col.names <- gsub(".id", "sample", col.names)
cells_ago1k_spline <- setNames(cells_ago1k_spline, col.names)
ggplot(cells_ago1k_spline , aes(x = x, y = y)) +
geom_line(aes(color = sample)) +
scale_color_manual(values = c("#1B0B80", "#999999")) + theme_bw()
“aegypti_Ago1_novel_fam_rn.fa” file with 6mer target sequences of the most abundant miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github
#all peak ranges
res_Ago1_aegypti_novel <- annotatePeaksWithPatterns(peaks=allGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/aegypti_Ago1_novel_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
## sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
## NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
## NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
## NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
## NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
## NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
## NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
## NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
## NIGP01002255, and NIGP01002260. Note that ranges located on a
## sequence whose length is unknown (NA) or on a circular sequence
## are not considered out-of-bound (use seqlengths() and isCircular()
## to get the lengths and circularity flags of the underlying
## sequences). You can use trim() to trim these ranges. See
## ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 13 patterns
## Searching for 13 patterns in peaks....
## Search for ATGGTA
## Search for GACCTA
## Search for CGGAAT
## Search for TGGTCA
## Search for ACACCA
## Search for CCTCAC
## Search for CGGCCT
## Search for GGCCTA
## Search for TACGTA
## Search for CTACCT
## Search for ATGTCG
## Search for TATGGG
## Search for TCACTA
## ....finished search
res_Ago1_aegypti_novel <- res_Ago1_aegypti_novel[[1]]
res_Ago1_aegypti_novel <- rbindlist(res_Ago1_aegypti_novel)
res_Ago1_aegypti_novel$group <- paste0(res_Ago1_aegypti_novel$SeqExtended, "_", res_Ago1_aegypti_novel$siteOfPattern)
res_Ago1_aegypti_novel$startOfPmatch <- as.numeric(res_Ago1_aegypti_novel$startOfPmatch)
res_Ago1_aegypti_novel$filt <- duplicated(res_Ago1_aegypti_novel$group)
res_Ago1_aegypti_novel <- subset(res_Ago1_aegypti_novel, res_Ago1_aegypti_novel$filt == FALSE)
#control ranges
res_Ago1_aegypti_novel_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/aegypti_Ago1_novel_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
## NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
## NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
## NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
## NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
## NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
## NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
## NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
## NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
## NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
## NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
## NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
## NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
## NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
## NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
## NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
## NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
## NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
## NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
## NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
## NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
## NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
## NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
## NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
## NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
## NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
## NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
## NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
## NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
## NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
## NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
## NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
## NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
## NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
## NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
## NIGP01002295. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
## NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
## NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
## NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
## NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
## NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
## NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
## NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
## NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
## NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
## NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
## NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
## NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
## NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
## NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
## NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
## NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
## NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
## NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
## NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
## NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
## NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
## NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
## NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
## NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
## NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
## NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
## NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
## NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
## NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
## NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
## NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
## NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
## NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
## NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
## NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
## NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
## located on a sequence whose length is unknown (NA) or on a
## circular sequence are not considered out-of-bound (use
## seqlengths() and isCircular() to get the lengths and circularity
## flags of the underlying sequences). You can use trim() to trim
## these ranges. See ?`trim,GenomicRanges-method` for more
## information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 13 patterns
## Searching for 13 patterns in peaks....
## Search for ATGGTA
## Search for GACCTA
## Search for CGGAAT
## Search for TGGTCA
## Search for ACACCA
## Search for CCTCAC
## Search for CGGCCT
## Search for GGCCTA
## Search for TACGTA
## Search for CTACCT
## Search for ATGTCG
## Search for TATGGG
## Search for TCACTA
## ....finished search
res_Ago1_aegypti_novel_ctrl <- res_Ago1_aegypti_novel_ctrl[[1]]
res_Ago1_aegypti_novel_ctrl <- rbindlist(res_Ago1_aegypti_novel_ctrl)
res_Ago1_aegypti_novel_ctrl$group <- paste0(res_Ago1_aegypti_novel_ctrl$SeqExtended, "_", res_Ago1_aegypti_novel_ctrl$siteOfPattern)
res_Ago1_aegypti_novel_ctrl$startOfPmatch <- as.numeric(res_Ago1_aegypti_novel_ctrl$startOfPmatch)
res_Ago1_aegypti_novel_ctrl$filt <- duplicated(res_Ago1_aegypti_novel_ctrl$group)
res_Ago1_aegypti_novel_ctrl <- subset(res_Ago1_aegypti_novel_ctrl, res_Ago1_aegypti_novel_ctrl$filt == FALSE)
#get GRs with the same peak subset for calculating specific coverage
Ago1_aegypti_nranges <- subset(res_Ago1_aegypti_novel, res_Ago1_aegypti_novel$aegypti_Ago1_BCsample > 0 & res_Ago1_aegypti_novel$aegypti_Ago2_BCsample ==0)
Ago1_aegypti_ctrl_nranges <- subset(res_Ago1_aegypti_novel_ctrl, res_Ago1_aegypti_novel_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_aegypti_novel_ctrl$aegypti_Ago2_BCsample ==0)
#subset matched patterns in peaks with support from aegypti Ago1 reads
Ago1_aegypti_seed_novel_ctrl <- subset(res_Ago1_aegypti_novel_ctrl$startOfPmatch, res_Ago1_aegypti_novel_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_aegypti_novel_ctrl$aegypti_Ago2_BCsample ==0)
Ago1_aegypti_seed_novel <- subset(res_Ago1_aegypti_novel$startOfPmatch, res_Ago1_aegypti_novel$aegypti_Ago1_BCsample > 0 & res_Ago1_aegypti_novel$aegypti_Ago2_BCsample ==0)
“cells_Ago1_novel_fam_rn.fa” file with 6mer target sequences of the most abundant miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github
#all peak ranges
res_Ago1_cells_novel <- annotatePeaksWithPatterns(peaks=allGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/cells_Ago1_novel_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
## sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
## NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
## NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
## NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
## NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
## NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
## NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
## NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
## NIGP01002255, and NIGP01002260. Note that ranges located on a
## sequence whose length is unknown (NA) or on a circular sequence
## are not considered out-of-bound (use seqlengths() and isCircular()
## to get the lengths and circularity flags of the underlying
## sequences). You can use trim() to trim these ranges. See
## ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 15 patterns
## Searching for 15 patterns in peaks....
## Search for AAATCC
## Search for ATGGTA
## Search for GTAGGA
## Search for GTGGGA
## Search for ATGGAC
## Search for CCAGGA
## Search for GACCTA
## Search for CCAATC
## Search for CGGAAT
## Search for TGGTCA
## Search for AATCCA
## Search for ATAAAT
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 4 out-of-bound ranges located on sequence
## NIGP01000607. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## Search for TGCCGT
## Search for CTACCT
## Search for TATGGG
## ....finished search
res_Ago1_cells_novel <- res_Ago1_cells_novel[[1]]
res_Ago1_cells_novel <- rbindlist(res_Ago1_cells_novel)
res_Ago1_cells_novel$group <- paste0(res_Ago1_cells_novel$SeqExtended, "_", res_Ago1_cells_novel$siteOfPattern)
res_Ago1_cells_novel$startOfPmatch <- as.numeric(res_Ago1_cells_novel$startOfPmatch)
res_Ago1_cells_novel$filt <- duplicated(res_Ago1_cells_novel$group)
res_Ago1_cells_novel <- subset(res_Ago1_cells_novel, res_Ago1_cells_novel$filt == FALSE)
#control ranges
res_Ago1_cells_novel_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/cells_Ago1_novel_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
## NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
## NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
## NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
## NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
## NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
## NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
## NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
## NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
## NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
## NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
## NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
## NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
## NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
## NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
## NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
## NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
## NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
## NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
## NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
## NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
## NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
## NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
## NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
## NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
## NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
## NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
## NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
## NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
## NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
## NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
## NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
## NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
## NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
## NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
## NIGP01002295. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
## NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
## NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
## NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
## NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
## NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
## NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
## NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
## NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
## NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
## NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
## NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
## NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
## NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
## NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
## NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
## NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
## NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
## NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
## NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
## NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
## NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
## NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
## NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
## NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
## NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
## NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
## NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
## NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
## NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
## NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
## NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
## NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
## NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
## NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
## NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
## NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
## located on a sequence whose length is unknown (NA) or on a
## circular sequence are not considered out-of-bound (use
## seqlengths() and isCircular() to get the lengths and circularity
## flags of the underlying sequences). You can use trim() to trim
## these ranges. See ?`trim,GenomicRanges-method` for more
## information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 15 patterns
## Searching for 15 patterns in peaks....
## Search for AAATCC
## Search for ATGGTA
## Search for GTAGGA
## Search for GTGGGA
## Search for ATGGAC
## Search for CCAGGA
## Search for GACCTA
## Search for CCAATC
## Search for CGGAAT
## Search for TGGTCA
## Search for AATCCA
## Search for ATAAAT
## Search for TGCCGT
## Search for CTACCT
## Search for TATGGG
## ....finished search
res_Ago1_cells_novel_ctrl <- res_Ago1_cells_novel_ctrl[[1]]
res_Ago1_cells_novel_ctrl <- rbindlist(res_Ago1_cells_novel_ctrl)
res_Ago1_cells_novel_ctrl$group <- paste0(res_Ago1_cells_novel_ctrl$SeqExtended, "_", res_Ago1_cells_novel_ctrl$siteOfPattern)
res_Ago1_cells_novel_ctrl$startOfPmatch <- as.numeric(res_Ago1_cells_novel_ctrl$startOfPmatch)
res_Ago1_cells_novel_ctrl$filt <- duplicated(res_Ago1_cells_novel_ctrl$group)
res_Ago1_cells_novel_ctrl <- subset(res_Ago1_cells_novel_ctrl, res_Ago1_cells_novel_ctrl$filt == FALSE)
#get GRs with the same peak subset for calculating specific coverage
Ago1_Aag2_nranges <- subset(res_Ago1_cells_novel, res_Ago1_cells_novel$Aag2_Ago1_BCsample > 0 & res_Ago1_cells_novel$Aag2_Ago2_BCsample == 0 )
Ago1_Aag2_ctrl_nranges <- subset(res_Ago1_cells_novel_ctrl, res_Ago1_cells_novel_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_cells_novel_ctrl$Aag2_Ago2_BCsample == 0 )
#subset matched patterns in peaks with support from Aag2 Ago1 reads
Ago1_Aag2_seed_novel_ctrl <- subset(res_Ago1_cells_novel_ctrl$startOfPmatch, res_Ago1_cells_novel_ctrl$Aag2_Ago1_BCsample> 0 & res_Ago1_cells_novel_ctrl$Aag2_Ago2_BCsample ==0)
Ago1_Aag2_seed_novel <- subset(res_Ago1_cells_novel$startOfPmatch, res_Ago1_cells_novel$Aag2_Ago1_BCsample > 0 & res_Ago1_cells_novel$Aag2_Ago2_BCsample == 0 )
# #aegypti Ago1 novel peak ranges
# Ago1_aegyptin_GR <- GRanges(seqnames = Ago1_aegypti_nranges$seqnames.x, strand = Ago1_aegypti_nranges$strand.x, ranges = IRanges(start = Ago1_aegypti_nranges$start.x, width =Ago1_aegypti_nranges$width.x ))
#
# toPlot_aegypti_Ago1nalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegyptin_GR ,format = "bigwig")
# f <- plotRegion(toPlot_aegypti_Ago1nalt,outliers = 0.05,groupBy = "Sample")
# aegypti_Ago1nalt <- f$data
#
# #aegypti Ago1 novel control ranges
# Ago1_aegyptin_ctrlGR <- GRanges(seqnames = Ago1_aegypti_ctrl_nranges$seqnames.x, strand = Ago1_aegypti_ctrl_nranges$strand.x,ranges = IRanges(start = Ago1_aegypti_ctrl_nranges$start.x, width =Ago1_aegypti_ctrl_nranges$width.x ))
#
# ToPlot_ctrl_aegypti_Ago1nalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegyptin_ctrlGR,format = "bigwig")
# ctrlaegypti_Ago1nalt_peaks <- plotRegion(ToPlot_ctrl_aegypti_Ago1nalt ,outliers = 0.05)
# ctrlaegypti_Ago1nalt_peaks <- ctrlaegypti_Ago1nalt_peaks$data
# ctrlaegypti_Ago1nalt_peaks$Sample <- c("ctrl_peaks")
#
# aegypti_Ago1nalt <- rbind(ctrlaegypti_Ago1nalt_peaks, aegypti_Ago1nalt)
#
# #Aag2 Ago1 novel peak ranges
# Ago1_Aag2n_GR <- GRanges(seqnames = Ago1_Aag2_nranges$seqnames.x, strand = Ago1_Aag2_nranges$strand.x,
# ranges = IRanges(start = Ago1_Aag2_nranges$start.x, width =Ago1_Aag2_nranges$width.x ))
#
# ToPlot_Aag2_Ago1nalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2n_GR ,format = "bigwig")
# f <- plotRegion(ToPlot_Aag2_Ago1nalt,outliers = 0.05,groupBy = "Sample")
# Aag2_Ago1nalt <- f$data
#
# #Aag2 Ago1 novel control ranges
#
# Ago1_Aag2n_ctrlGR <- GRanges(seqnames = Ago1_Aag2_ctrl_nranges$seqnames.x, strand = Ago1_Aag2_ctrl_nranges$strand.x,ranges = IRanges(start = Ago1_Aag2_ctrl_nranges$start.x, width =Ago1_Aag2_ctrl_nranges$width.x ))
#
# ToPlot_ctrl_Aag2_Ago1nalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2n_ctrlGR ,format = "bigwig")
# ctrlAag2_Ago1nalt_peaks <- plotRegion(ToPlot_ctrl_Aag2_Ago1nalt ,outliers = 0.05)
# ctrlAag2_Ago1nalt_peaks <- ctrlAag2_Ago1nalt_peaks$data
# ctrlAag2_Ago1nalt_peaks$Sample <- c("ctrl_peaks")
#
# Aag2_Ago1nalt <- rbind(ctrlAag2_Ago1nalt_peaks, Aag2_Ago1nalt)
#
# #save(Aag2_Ago1nalt, aegypti_Ago1nalt, file="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/novelmiRNAs.RData")
output file “novelmiRNAs.RData” is available on my Github
load("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/novelmiRNAs.RData")
nf <- c("Aag2_Ago1nalt", "aegypti_Ago1nalt")
norm_f <- do.call("list",mget(nf))
cov <- lapply(norm_f, function(x) subset(x, x$xIndex >= 1400 & x$xIndex <=1600))
covdf <- rbindlist(cov, idcol = TRUE)
covdf$xIndex <- covdf$xIndex - 1500
col.names <- colnames(covdf)
col.names <- gsub(".id", "ranges", col.names)
col.names <- gsub("xIndex", "position", col.names)
covdf <- setNames(covdf, col.names)
cells_ago1 <- subset(covdf, covdf$ranges=="Aag2_Ago1nalt")
aegypti_ago1 <- subset(covdf, covdf$ranges=="aegypti_Ago1nalt")
cells_ago1$merge <- ifelse(cells_ago1$Sample=="cat_Aag2Ago1.bw" , paste0(cells_ago1$position, "Ago1_Aag2_seed_novel"), paste0(cells_ago1$position, "Ago1_Aag2_seed_novel_ctrl"))
aegypti_ago1$merge <- ifelse(aegypti_ago1$Sample=="cat_aegyptiAgo1.bw" , paste0(aegypti_ago1$position, "Ago1_aegypti_seed_novel"), paste0(aegypti_ago1$position, "Ago1_aegypti_seed_novel_ctrl"))
#make list of pat matches and then count all together for graphing
Ago1novel <- grep("_seed_novel", names(.GlobalEnv),value=TRUE)
Ago1novel <- grep("Ago1_", Ago1novel, value=TRUE)
Ago1novel <- do.call("list",mget(Ago1novel))
Ago1novel <- lapply(Ago1novel, as.data.frame)
Ago1novel <- lapply(Ago1novel, setNames, nm="position")
Ago1novel_counts <- lapply(Ago1novel, function(x) x %>% group_by(position) %>% summarize(count=n()))
Ago1novel_counts <- lapply(Ago1novel_counts, function(x) subset(x, x$position <= 300 & x$position >=100))
#make master df by sample name
Ago1noveldf <- rbindlist(Ago1novel_counts, idcol = TRUE)
Ago1noveldf$position <- Ago1noveldf$position - 200
Aag2Ago1n <- subset(Ago1noveldf, Ago1noveldf$.id=="Ago1_Aag2_seed_novel_ctrl" | Ago1noveldf$.id=="Ago1_Aag2_seed_novel")
Aag2Ago1n$merge <- paste0(Aag2Ago1n$position,Aag2Ago1n$.id)
aegyptiAgo1n <- subset(Ago1noveldf, Ago1noveldf$.id=="Ago1_aegypti_seed_novel_ctrl" | Ago1noveldf$.id=="Ago1_aegypti_seed_novel")
aegyptiAgo1n$merge <- paste0(aegyptiAgo1n$position, aegyptiAgo1n$.id)
aegypti_ago1 <- merge(aegyptiAgo1n, aegypti_ago1, by="merge")
cells_ago1 <- merge(Aag2Ago1n, cells_ago1, by="merge")
aegypti_ago1 <- aegypti_ago1 %>%
group_by(Sample) %>%
mutate( counts_norm = (count*Score))
aegypti_ago1 <- aegypti_ago1 %>%
group_by(Sample) %>%
mutate(freq_norm = counts_norm/(sum(counts_norm)))
aegypti_ago1n <-split(aegypti_ago1, aegypti_ago1$.id)
aegypti_ago1n <- lapply(aegypti_ago1n, function(x) x[order(x$position.x),])
aegypti_ago1n_spline <-lapply(aegypti_ago1n, function(x) spline(x$freq_norm))
aegypti_ago1n_spline <- rbindlist(aegypti_ago1n_spline, idcol = TRUE)
aegypti_ago1n_spline$x <- aegypti_ago1n_spline$x - 101
col.names <- colnames(aegypti_ago1n_spline)
col.names <- gsub(".id", "sample", col.names)
aegypti_ago1n_spline <- setNames(aegypti_ago1n_spline, col.names)
ggplot(aegypti_ago1n_spline , aes(x = x, y = y)) +
geom_line(aes(color = sample)) +
scale_color_manual(values = c("#3360A9", "#999999")) + theme_bw()
cells_ago1 <- cells_ago1 %>%
group_by(Sample) %>%
mutate( counts_norm = (count*Score))
cells_ago1 <-cells_ago1 %>%
group_by(Sample) %>%
mutate(freq_norm = counts_norm/(sum(counts_norm)))
cells_ago1n <-split(cells_ago1, cells_ago1$.id)
cells_ago1n <- lapply(cells_ago1n, function(x) x[order(x$position.x),])
cells_ago1n_spline <-lapply(cells_ago1n, function(x) spline(x$freq_norm))
cells_ago1n_spline <- rbindlist(cells_ago1n_spline, idcol = TRUE)
cells_ago1n_spline$x <- cells_ago1n_spline $x - 101
col.names <- colnames(cells_ago1n_spline)
col.names <- gsub(".id", "sample", col.names)
cells_ago1n_spline <- setNames(cells_ago1n_spline, col.names)
ggplot(cells_ago1n_spline , aes(x = x, y = y)) +
geom_line(aes(color = sample)) +
scale_color_manual(values = c("#1B0B80", "#999999")) + theme_bw()
“known_miRNAs_novelab_mos_fam_rn.fa” file with 6mer target sequences of known miRNAs of similar abundance as the most abundant novel miRNAs in mosquitoes was generated in smallRNA_abundance_scatterplots script and is available in my Github
#all peak ranges
res_Ago1_av_known <- annotatePeaksWithPatterns(peaks=allGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_miRNAs_novelab_mos_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
## sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
## NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
## NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
## NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
## NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
## NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
## NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
## NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
## NIGP01002255, and NIGP01002260. Note that ranges located on a
## sequence whose length is unknown (NA) or on a circular sequence
## are not considered out-of-bound (use seqlengths() and isCircular()
## to get the lengths and circularity flags of the underlying
## sequences). You can use trim() to trim these ranges. See
## ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 6 patterns
## Searching for 6 patterns in peaks....
## Search for GATATT
## Search for GCACAA
## Search for TAAACC
## Search for TACCTC
## Search for TCAGGG
## Search for TGCCTT
## ....finished search
res_Ago1_av_known <- res_Ago1_av_known[[1]]
res_Ago1_av_known <- rbindlist(res_Ago1_av_known)
res_Ago1_av_known$group <- paste0(res_Ago1_av_known$SeqExtended, "_", res_Ago1_av_known$siteOfPattern)
res_Ago1_av_known$startOfPmatch <- as.numeric(res_Ago1_av_known$startOfPmatch)
res_Ago1_av_known$filt <- duplicated(res_Ago1_av_known$group)
res_Ago1_av_known <- subset(res_Ago1_av_known, res_Ago1_av_known$filt == FALSE)
#control ranges
res_Ago1_av_known_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_miRNAs_novelab_mos_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
## NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
## NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
## NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
## NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
## NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
## NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
## NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
## NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
## NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
## NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
## NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
## NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
## NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
## NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
## NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
## NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
## NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
## NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
## NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
## NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
## NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
## NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
## NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
## NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
## NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
## NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
## NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
## NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
## NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
## NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
## NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
## NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
## NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
## NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
## NIGP01002295. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
## NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
## NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
## NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
## NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
## NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
## NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
## NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
## NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
## NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
## NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
## NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
## NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
## NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
## NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
## NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
## NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
## NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
## NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
## NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
## NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
## NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
## NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
## NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
## NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
## NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
## NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
## NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
## NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
## NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
## NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
## NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
## NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
## NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
## NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
## NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
## NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
## located on a sequence whose length is unknown (NA) or on a
## circular sequence are not considered out-of-bound (use
## seqlengths() and isCircular() to get the lengths and circularity
## flags of the underlying sequences). You can use trim() to trim
## these ranges. See ?`trim,GenomicRanges-method` for more
## information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 6 patterns
## Searching for 6 patterns in peaks....
## Search for GATATT
## Search for GCACAA
## Search for TAAACC
## Search for TACCTC
## Search for TCAGGG
## Search for TGCCTT
## ....finished search
res_Ago1_av_known_ctrl <- res_Ago1_av_known_ctrl[[1]]
res_Ago1_av_known_ctrl <- rbindlist(res_Ago1_av_known_ctrl)
res_Ago1_av_known_ctrl$group <- paste0(res_Ago1_av_known_ctrl$SeqExtended, "_", res_Ago1_av_known_ctrl$siteOfPattern)
res_Ago1_av_known_ctrl$startOfPmatch <- as.numeric(res_Ago1_av_known_ctrl$startOfPmatch)
res_Ago1_av_known_ctrl$filt <- duplicated(res_Ago1_av_known_ctrl$group)
res_Ago1_av_known_ctrl <- subset(res_Ago1_av_known_ctrl, res_Ago1_av_known_ctrl$filt == FALSE)
#subset matched patterns in peaks with support from aegypti Ago1 reads
res_Ago1_av_aegypti_known_ctrl <- subset(res_Ago1_av_known_ctrl$startOfPmatch, res_Ago1_av_known_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_av_known_ctrl$aegypti_Ago2_BCsample == 0)
res_Ago1_av_aegypti_known <- subset(res_Ago1_av_known$startOfPmatch, res_Ago1_av_known$aegypti_Ago1_BCsample > 0 & res_Ago1_av_known$aegypti_Ago2_BCsample == 0)
#get GRs with the same peak subset for calculating specific coverage
res_Ago1_av_aegypti_known_ctrl_pats <- subset(res_Ago1_av_known_ctrl, res_Ago1_av_known_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_av_known_ctrl$aegypti_Ago2_BCsample == 0)
res_Ago1_av_aegypti_known_pats <- subset(res_Ago1_av_known, res_Ago1_av_known$aegypti_Ago1_BCsample > 0 & res_Ago1_av_known$aegypti_Ago2_BCsample == 0)
“known_miRNAs_novelab_cells_fam_rn.fa” file with 6mer target sequences of known miRNAs of similar abundance as the most abundant novel miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github
#all peak ranges
res_Ago1_av_known <- annotatePeaksWithPatterns(peaks=allGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_miRNAs_novelab_cells_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
## sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
## NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
## NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
## NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
## NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
## NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
## NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
## NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
## NIGP01002255, and NIGP01002260. Note that ranges located on a
## sequence whose length is unknown (NA) or on a circular sequence
## are not considered out-of-bound (use seqlengths() and isCircular()
## to get the lengths and circularity flags of the underlying
## sequences). You can use trim() to trim these ranges. See
## ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 4 patterns
## Searching for 4 patterns in peaks....
## Search for ACGCTT
## Search for AGTATT
## Search for AGTGAG
## Search for TGGTAA
## ....finished search
res_Ago1_av_known <- res_Ago1_av_known[[1]]
res_Ago1_av_known <- rbindlist(res_Ago1_av_known)
res_Ago1_av_known$group <- paste0(res_Ago1_av_known$SeqExtended, "_", res_Ago1_av_known$siteOfPattern)
res_Ago1_av_known$startOfPmatch <- as.numeric(res_Ago1_av_known$startOfPmatch)
res_Ago1_av_known$filt <- duplicated(res_Ago1_av_known$group)
res_Ago1_av_known <- subset(res_Ago1_av_known, res_Ago1_av_known$filt == FALSE)
#control ranges
res_Ago1_av_known_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_miRNAs_novelab_cells_fam_rn.fa",
resize=406,
add5=1,
add3=1,
verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
## NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
## NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
## NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
## NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
## NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
## NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
## NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
## NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
## NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
## NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
## NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
## NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
## NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
## NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
## NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
## NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
## NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
## NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
## NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
## NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
## NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
## NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
## NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
## NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
## NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
## NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
## NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
## NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
## NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
## NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
## NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
## NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
## NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
## NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
## NIGP01002295. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
## sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
## NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
## NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
## NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
## NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
## NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
## NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
## NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
## NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
## NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
## NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
## NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
## NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
## NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
## NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
## NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
## NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
## NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
## NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
## NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
## NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
## NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
## NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
## NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
## NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
## NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
## NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
## NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
## NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
## NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
## NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
## NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
## NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
## NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
## NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
## NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
## NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
## NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
## NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
## NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
## NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
## NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
## NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
## NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
## NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
## NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
## NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
## NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
## NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
## NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
## NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
## NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
## NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
## NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
## located on a sequence whose length is unknown (NA) or on a
## circular sequence are not considered out-of-bound (use
## seqlengths() and isCircular() to get the lengths and circularity
## flags of the underlying sequences). You can use trim() to trim
## these ranges. See ?`trim,GenomicRanges-method` for more
## information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 4 patterns
## Searching for 4 patterns in peaks....
## Search for ACGCTT
## Search for AGTATT
## Search for AGTGAG
## Search for TGGTAA
## ....finished search
res_Ago1_av_known_ctrl <- res_Ago1_av_known_ctrl[[1]]
res_Ago1_av_known_ctrl <- rbindlist(res_Ago1_av_known_ctrl)
res_Ago1_av_known_ctrl$group <- paste0(res_Ago1_av_known_ctrl$SeqExtended, "_", res_Ago1_av_known_ctrl$siteOfPattern)
res_Ago1_av_known_ctrl$startOfPmatch <- as.numeric(res_Ago1_av_known_ctrl$startOfPmatch)
res_Ago1_av_known_ctrl$filt <- duplicated(res_Ago1_av_known_ctrl$group)
res_Ago1_av_known_ctrl <- subset(res_Ago1_av_known_ctrl, res_Ago1_av_known_ctrl$filt == FALSE)
#subset matched patterns in peaks with support from Aag2 Ago1 reads
res_Ago1_av_cells_known_ctrl <- subset(res_Ago1_av_known_ctrl$startOfPmatch, res_Ago1_av_known_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_av_known_ctrl$Aag2_Ago2_BCsample == 0)
res_Ago1_av_cells_known <- subset(res_Ago1_av_known$startOfPmatch, res_Ago1_av_known$Aag2_Ago1_BCsample > 0 & res_Ago1_av_known$Aag2_Ago2_BCsample == 0)
#get GRs with the same peak subset for calculating specific coverage
res_Ago1_av_cells_known_ctrl_pats <- subset(res_Ago1_av_known_ctrl, res_Ago1_av_known_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_av_known_ctrl$Aag2_Ago2_BCsample == 0)
res_Ago1_av_cells_known_pats <- subset(res_Ago1_av_known, res_Ago1_av_known$Aag2_Ago1_BCsample > 0 & res_Ago1_av_known$Aag2_Ago2_BCsample == 0)
# #aegypti Ago1 known (novel ab) peak ranges
# Ago1_aegypti_avk_GR <- GRanges(seqnames = res_Ago1_av_aegypti_known_pats$seqnames.x, strand = res_Ago1_av_aegypti_known_pats$strand.x, ranges = IRanges(start = res_Ago1_av_aegypti_known_pats$start.x, width =res_Ago1_av_aegypti_known_pats$width.x ))
#
# ToPlot_ctrl_Ago1_aegypti_avk <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegypti_avk_GR,format = "bigwig")
# aegypti_Ago1_avk <- plotRegion(ToPlot_ctrl_Ago1_aegypti_avk ,outliers = 0.05)
# aegypti_Ago1_avk <- aegypti_Ago1_avk$data
#
# #aegypti Ago1 known (novel ab) control ranges
# Ago1_aegypti_avk_ctrlGR <- GRanges(seqnames = res_Ago1_av_aegypti_known_ctrl_pats$seqnames.x, strand = res_Ago1_av_aegypti_known_ctrl_pats$strand.x,ranges = IRanges(start = res_Ago1_av_aegypti_known_ctrl_pats$start.x, width =res_Ago1_av_aegypti_known_ctrl_pats$width.x ))
#
# toPlot_aegypti_avk_ctrl <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegypti_avk_ctrlGR ,format = "bigwig")
# aegypti_Ago1_avk_ctrl <- plotRegion(toPlot_aegypti_avk_ctrl,outliers = 0.05,groupBy = "Sample")
# aegypti_Ago1_avk_ctrl <- aegypti_Ago1_avk_ctrl$data
# aegypti_Ago1_avk_ctrl$Sample <- c("ctrl_peaks")
#
# aegypti_Ago1_avk <- rbind(aegypti_Ago1_avk, aegypti_Ago1_avk_ctrl)
#
# #Aag2 Ago1 known (novel ab) peak ranges
# Ago1_Aag2_avk_GR <- GRanges(seqnames = res_Ago1_av_cells_known_pats$seqnames.x, strand = res_Ago1_av_cells_known_pats$strand.x, ranges = IRanges(start = res_Ago1_av_cells_known_pats$start.x, width =res_Ago1_av_cells_known_pats$width.x ))
#
# ToPlot_Aag2_Ago1_avk <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2_avk_GR ,format = "bigwig")
# Aag2_Ago1_avk <- plotRegion(ToPlot_Aag2_Ago1_avk ,outliers = 0.05)
# Aag2_Ago1_avk <- Aag2_Ago1_avk$data
#
# #Aag2 Ago1 known (novel ab) control ranges
# Ago1_Aag2_avk_ctrlGR <- GRanges(seqnames = res_Ago1_av_cells_known_ctrl_pats$seqnames.x, strand = res_Ago1_av_cells_known_ctrl_pats$strand.x, ranges = IRanges(start = res_Ago1_av_cells_known_ctrl_pats$start.x, width =res_Ago1_av_cells_known_ctrl_pats$width.x ))
#
# ToPlot_Aag2_Ago1_avk_ctrl <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2_avk_ctrlGR ,format = "bigwig")
# f <- plotRegion(ToPlot_Aag2_Ago1_avk_ctrl,outliers = 0.05,groupBy = "Sample")
# Aag2_Ago1_avk_ctrl <- f$data
# Aag2_Ago1_avk_ctrl$Sample <- c("ctrl_peaks")
#
# Aag2_Ago1_avk <- rbind(Aag2_Ago1_avk_ctrl, Aag2_Ago1_avk)
#
# #save(Aag2_Ago1_avk, aegypti_Ago1_avk, file="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_novelabmiRNAs.RData")
output file “known_novelabmiRNAs.RData” is available on my Github
load("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_novelabmiRNAs.RData")
Ago1_av_known <- grep("res_Ago1_av_a", names(.GlobalEnv),value=TRUE)
Ago1_av_known <- c(Ago1_av_known, grep("res_Ago1_av_c", names(.GlobalEnv),value=TRUE))
Ago1_av_known <- grep("pats", Ago1_av_known,value=TRUE, invert = TRUE)
Ago1_av_known <- do.call("list",mget(Ago1_av_known ))
Ago1_av_known <- lapply(Ago1_av_known , as.data.frame)
Ago1_av_known <- lapply(Ago1_av_known , setNames, nm="position")
Ago1_av_known_counts <- lapply(Ago1_av_known , function(x) x %>% group_by(position) %>% summarize(count=n()))
Ago1_av_known_counts <- lapply(Ago1_av_known_counts, function(x) subset(x, x$position <= 300 & x$position >=100))
Ago1avdf <- rbindlist(Ago1_av_known_counts, idcol = TRUE)
Ago1avdf$position <- Ago1avdf$position - 200
Aag2Ago1av <- subset(Ago1avdf, Ago1avdf$.id=="res_Ago1_av_cells_known_ctrl" | Ago1avdf$.id=="res_Ago1_av_cells_known")
Aag2Ago1av$merge <- paste0(Aag2Ago1av$position,Aag2Ago1av$.id)
aegyptiAgo1av <- subset(Ago1avdf, Ago1avdf$.id=="res_Ago1_av_aegypti_known_ctrl" | Ago1avdf$.id=="res_Ago1_av_aegypti_known")
aegyptiAgo1av$merge <- paste0(aegyptiAgo1av$position, aegyptiAgo1av$.id)
av <- c("Aag2_Ago1_avk", "aegypti_Ago1_avk")
norm_av <- do.call("list",mget(av))
avcov <- lapply(norm_av, function(x) subset(x, x$xIndex >= 1400 & x$xIndex <=1600))
avcovdf <- rbindlist(avcov, idcol = TRUE)
avcovdf$xIndex <- avcovdf$xIndex - 1500
col.names <- colnames(avcovdf)
col.names <- gsub(".id", "ranges", col.names)
col.names <- gsub("xIndex", "position", col.names)
avcovdf <- setNames(avcovdf, col.names)
avcells_ago1 <- subset(avcovdf, avcovdf$ranges=="Aag2_Ago1_avk")
avaegypti_ago1 <- subset(avcovdf, avcovdf$ranges=="aegypti_Ago1_avk")
avaegypti_ago1$merge <- ifelse(avaegypti_ago1$Sample=="cat_aegyptiAgo1.bw" , paste0(avaegypti_ago1$position, "res_Ago1_av_aegypti_known"), paste0(avaegypti_ago1$position, "res_Ago1_av_aegypti_known_ctrl"))
avcells_ago1$merge <- ifelse(avcells_ago1$Sample=="cat_Aag2Ago1.bw" , paste0(avcells_ago1$position, "res_Ago1_av_cells_known"), paste0(avcells_ago1$position, "res_Ago1_av_cells_known_ctrl"))
aegyptiAgo1av <- merge(aegyptiAgo1av, avaegypti_ago1, by="merge")
Aag2Ago1av <- merge(Aag2Ago1av, avcells_ago1, by="merge")
aegypti_ago1av <- aegyptiAgo1av %>%
group_by(Sample) %>%
mutate( counts_norm = (count*Score))
aegypti_ago1av <- aegypti_ago1av %>%
group_by(Sample) %>%
mutate(freq_norm = counts_norm/(sum(counts_norm)))
cells_ago1av <- Aag2Ago1av%>%
group_by(Sample) %>%
mutate( counts_norm = (count*Score))
cells_ago1av <-cells_ago1av %>%
group_by(Sample) %>%
mutate(freq_norm = counts_norm/(sum(counts_norm)))
aegypti_ago1av <-split(aegypti_ago1av, aegypti_ago1av$.id)
aegypti_ago1av <- lapply(aegypti_ago1av, function(x) x[order(x$position.x),])
aegypti_ago1av_spline <-lapply(aegypti_ago1av, function(x) spline(x$freq_norm))
aegypti_ago1av_spline <- rbindlist(aegypti_ago1av_spline, idcol = TRUE)
aegypti_ago1av_spline$x <- aegypti_ago1av_spline$x - 101
col.names <- colnames(aegypti_ago1av_spline)
col.names <- gsub(".id", "sample", col.names)
aegypti_ago1av_spline <- setNames(aegypti_ago1av_spline, col.names)
ggplot(aegypti_ago1av_spline , aes(x = x, y = y)) +
geom_line(aes(color = sample)) +
scale_color_manual(values = c("#3360A9", "#999999")) + theme_bw()
cells_ago1av <-split(cells_ago1av, cells_ago1av$.id)
cells_ago1av <- lapply(cells_ago1av, function(x) x[order(x$position.x),])
cells_ago1av_spline <-lapply(cells_ago1av, function(x) spline(x$freq_norm))
cells_ago1av_spline <- rbindlist(cells_ago1av_spline, idcol = TRUE)
cells_ago1av_spline$x <- cells_ago1av_spline $x - 101
col.names <- colnames(cells_ago1av_spline)
col.names <- gsub(".id", "sample", col.names)
cells_ago1av_spline <- setNames(cells_ago1av_spline, col.names)
ggplot(cells_ago1av_spline , aes(x = x, y = y)) +
geom_line(aes(color = sample)) +
scale_color_manual(values = c("#1B0B80", "#999999")) + theme_bw()