Seed search functions

Additionally, see CLIPflexR documentation for:
fetchSequencesForCLIP
annotatePeaksWithPatterns

 fetchSequencesForClIP <- function(peaks,reSize=NULL,fasta,add5=0,add3=0){
  swd <- Rsamtools::FaFile(fasta)
  
  fastaLens <- swd %>%  seqlengths
  start(peaks) <- start(peaks)-add5
  end(peaks) <- end(peaks)+add3
  sees <- seqlevels(peaks)
  seqlengths(peaks) <- fastaLens[match(sees,names(fastaLens),incomparables = 0)]
  
  
  
  Boundaries <- GRanges(seqlevels(peaks),IRanges(1,seqlengths(peaks)))
  
  if(!is.null(reSize)){
    resizePeaks <- resize(peaks, reSize, fix="center", use.names=TRUE)
    
  }else{
    resizePeaks <- peaks
  }
  names(resizePeaks) <- paste0(seqnames(resizePeaks),":",start(resizePeaks),"-",end(resizePeaks))
  resizePeaks <- trim(resizePeaks)
  temp <- findOverlaps(resizePeaks,Boundaries,type=c("within"))
  temp2 <- Rsamtools::getSeq(Rsamtools:::FaFile(fasta),resizePeaks[temp@from])
  names(temp2) <- names(resizePeaks[temp@from])
  temp2
}



annotatePeaksWithPatterns  <- function(peaks,fasta,patterns,resize=64,add5=0,add3=0,verbose=FALSE,checkReverse=TRUE){
  require(GenomicRanges)
  require(magrittr)
  require(Biostrings)
  require(Rsamtools)
  
  if(verbose) message("Reading in peaks....",appendLF = FALSE)
  if(is(peaks,"GRanges")){
    peaks <- peaks
    peaks$originalPeak <-  paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), "_", strand(peaks))
  }else if(is(peaks,"character")){
    if(!file.exists(peaks))stop(paste0("The file ",peaks," does not exist"))
    peaks <- read.table(peaks, header = TRUE)
    peaks <- makeGRangesFromDataFrame(peaks,
                                      keep.extra.columns=TRUE,
                                      ignore.strand=FALSE,
                                      seqinfo=NULL,
                                      seqnames.field="seqnames",
                                      start.field="start",
                                      end.field="end",
                                      strand.field="strand",
                                      starts.in.df.are.0based=FALSE)
    peaks$originalPeak <-  paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), ":", strand(peaks))
    
  }
  if(verbose) message("...done")
  if(verbose) message("Read in ",length(peaks)," peaks")
  
  
  if(!file.exists(fasta))stop(paste0("The file ",fasta," does not exist"))
  if(verbose) message("Indexing FASTA...",appendLF = FALSE)
  indexFa(fasta)
  if(verbose) message("...done")
  
  
  if(verbose) message("Aligning seqlevels across peaks and FASTA...",appendLF = FALSE)
  fastaLens <- FaFile(fasta) %>% seqlengths
  sees <- seqlevels(peaks)
  seqlengths(peaks) <- fastaLens[match(sees,names(fastaLens),incomparables = 0)]
  if(verbose) message("...done")
  
  if(verbose) message("Removing invalid peaks which are outside contig boundaries...",appendLF = FALSE)
  Boundaries <- GRanges(seqlevels(peaks),IRanges(1,seqlengths(peaks)))
  resizePeaks <- resize(peaks, resize, fix="center", use.names=TRUE)
  resizePeaks <- trim(resizePeaks)
  names(resizePeaks) <- paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), "_", strand(peaks))
  temp <- findOverlaps(resizePeaks,Boundaries,type=c("within"))
  validPeaks <- resizePeaks[temp@from]
  validPeaks$extendedPeak <- paste0(seqnames(validPeaks),":",start(validPeaks),"_",end(validPeaks), "_", strand(validPeaks))
  if(verbose) message("...done")
  if(verbose) message("Removed in ",length(validPeaks)-length(resizePeaks)," peaks")
  
  if(verbose) message("Retrieving sequence from under peaks....",appendLF = FALSE)
  myRes <- getSeq(Rsamtools:::FaFile(fasta),validPeaks)

  names(myRes) <- validPeaks$extendedPeak
  
  validPeaks$seqUnderExtendedPeak <- myRes
  if(verbose) message("...done")
  
  if(verbose) message("Retrieving patterns to search for....",appendLF = FALSE)
  if(length(patterns) > 1){
    pattern=patterns
  }else{
    motifSeq <- readDNAStringSet(patterns)
    pattern <- as.character(motifSeq)
  }
  if(verbose) message("...done")
  if(verbose) message("Read in ",length(pattern)," patterns")
  
  if(verbose) message("Searching for ",length(pattern)," patterns in peaks....")
  mergePeaksAndSites <- patternCallList <- list()
  for(i in 1:length(pattern)){
    if(verbose) message("Search for ",pattern[i])
    peaks_Sites <- validPeaks
    fixedInRegion <- vmatchPattern(pattern=pattern[i],subject=myRes) %>% unlist()
    
    
    startOfPmatch <- resize(fixedInRegion,width=1,fix="start") %>% unname %>% as.character
    pos <- matrix(unlist(strsplit(gsub(".*:","",names(fixedInRegion)),"_")),ncol=3,byrow =T)
    seq <- gsub(":.*","",names(fixedInRegion))
    sitesGRpos <- GRanges(seq[pos[,3] =="+"],
                          IRanges(as.numeric(pos[pos[,3] =="+",1])+start(fixedInRegion[pos[,3] =="+"])-1,
                                  as.numeric(pos[pos[,3] =="+",1])+end(fixedInRegion[pos[,3] =="+"])-1),startOfPmatch=startOfPmatch[pos[,3] =="+"])
    sitesGRpos$PeakID <- names(fixedInRegion[pos[,3] =="+"])
    
    sitesGRneg <- GRanges(seq[pos[,3] =="-"],
                          IRanges(as.numeric(pos[pos[,3] =="-",2])-end(fixedInRegion[pos[,3] =="-"])+1,
                                  as.numeric(pos[pos[,3] =="-",2])-start(fixedInRegion[pos[,3] =="-"])+1),startOfPmatch=startOfPmatch[pos[,3] =="-"])
    sitesGRneg$PeakID <- names(fixedInRegion[pos[,3] =="-"])
  
      sitesFA <- c(fetchSequencesForClIP(sitesGRpos,reSize=NULL,fasta),reverseComplement(fetchSequencesForClIP(sitesGRneg,reSize=NULL,fasta)))
      sitesFAExtended <- c(fetchSequencesForClIP(sitesGRpos,reSize=NULL,fasta,add5=add5,add3=add3),
                           reverseComplement(fetchSequencesForClIP(sitesGRneg,reSize=NULL,fasta,add5=add5,add3=add3))
      )
      sitesGR <- suppressWarnings(c(sitesGRpos,sitesGRneg))
    sitesGR$Seq <- sitesFA
    sitesGR$SeqExtended <- sitesFAExtended
    sitesGR$siteOfPattern <- granges(sitesGR) %>% unname %>% as.character
    sitesGR$centerOfPattern <-  resize(sitesGR,width=1,fix="center") %>% unname %>% as.character
    dfSitesGR <- as.data.frame(sitesGR)
    peaks_SitesDF <- as.data.frame(peaks_Sites)
    
    mergePeaksAndSites[[i]] <- merge(peaks_SitesDF,dfSitesGR,by.x="extendedPeak",by.y="PeakID",all=FALSE)
    patternCallList[[i]] <- mergePeaksAndSites[[i]] %>% dplyr::select(Seq,SeqExtended,startOfPmatch,centerOfPattern,siteOfPattern,originalPeak,extendedPeak)
    rm(peaks_SitesDF)
    rm(peaks_Sites)
  }
  if(verbose) message("....finished search")
  names(mergePeaksAndSites) <- names(patternCallList) <- as.character(pattern)
  return(list(mergePeaksAndSites,patternCallList))
}

Get peak ranges and control ranges

read in all peaks from matrix building and PCA filtering output; input file “Final_matrix_all_peaks_add_filtering.txt” is peak matrix with all sample and filtering information, generated in PCA_annotation_fgsea_targetvenns script, and available in my Github

#read in all peaks, with filtering stats/etc added from PCA output, available on my github
all <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_matrix_all_peaks_add_filtering.txt", sep = "\t", header = TRUE)

#make GR for seed search
allGR <- makeGRangesFromDataFrame(all,
                                     keep.extra.columns=TRUE,
                                     ignore.strand=FALSE,
                                     seqinfo=NULL,
                                     seqnames.field="seqnames",
                                     start.field="start",
                                     end.field="end",
                                     strand.field="strand",
                                     starts.in.df.are.0based=FALSE)

allctrlGR <- GenomicRanges::shift(allGR, 4000)  #note, annotations are no longer valid

Search top 10 most abundant known miRNA family 6mers in mosquitoes

“top10_aegypti_Ago1_known_fam_rn.fa” file with 6mer target sequences of the most abundant miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github

“Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa” is available for download at Vectorbase

Additionally, see CLIPflexR documentation for:
annotatePeaksWithPatterns

#all peak ranges
res <- annotatePeaksWithPatterns(peaks=allGR,
                                  fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
                                patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/top10_aegypti_Ago1_known_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Loading required package: Rsamtools
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
##   sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
##   NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
##   NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
##   NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
##   NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
##   NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
##   NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
##   NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
##   NIGP01002255, and NIGP01002260. Note that ranges located on a
##   sequence whose length is unknown (NA) or on a circular sequence
##   are not considered out-of-bound (use seqlengths() and isCircular()
##   to get the lengths and circularity flags of the underlying
##   sequences). You can use trim() to trim these ranges. See
##   ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 10 patterns
## Searching for 10 patterns in peaks....
## Search for ACTGCC
## Search for AGTATT
## Search for ATACTC
## Search for CATTCC
## Search for CCGTCC
## Search for CTCTCT
## Search for GATCTC
## Search for GCATTT
## Search for GTGTTC
## Search for TGTGAT
## ....finished search
res <- res[[1]]
res <- rbindlist(res)

#make sure are only counting pattern matches 1X
#could have dups if extended sequence of one peak overlaps with the extended peak of another sequence and then a match is in both; 
#if site of pattern and extended seq around pattern match is the same, randomly remove duplicates
res$group <- paste0(res$SeqExtended, "_", res$siteOfPattern)
res$startOfPmatch <- as.numeric(res$startOfPmatch)
res$filt <- duplicated(res$group)
res_Ago1_known <- subset(res, res$filt == FALSE)

#control ranges
res_Ago1_known_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
                              fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/top10_aegypti_Ago1_known_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
##   NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
##   NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
##   NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
##   NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
##   NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
##   NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
##   NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
##   NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
##   NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
##   NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
##   NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
##   NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
##   NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
##   NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
##   NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
##   NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
##   NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
##   NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
##   NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
##   NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
##   NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
##   NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
##   NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
##   NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
##   NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
##   NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
##   NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
##   NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
##   NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
##   NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
##   NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
##   NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
##   NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
##   NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
##   NIGP01002295. Note that ranges located on a sequence whose length
##   is unknown (NA) or on a circular sequence are not considered
##   out-of-bound (use seqlengths() and isCircular() to get the lengths
##   and circularity flags of the underlying sequences). You can use
##   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
##   more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
##   NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
##   NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
##   NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
##   NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
##   NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
##   NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
##   NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
##   NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
##   NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
##   NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
##   NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
##   NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
##   NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
##   NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
##   NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
##   NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
##   NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
##   NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
##   NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
##   NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
##   NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
##   NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
##   NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
##   NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
##   NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
##   NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
##   NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
##   NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
##   NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
##   NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
##   NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
##   NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
##   NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
##   NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
##   NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
##   NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
##   located on a sequence whose length is unknown (NA) or on a
##   circular sequence are not considered out-of-bound (use
##   seqlengths() and isCircular() to get the lengths and circularity
##   flags of the underlying sequences). You can use trim() to trim
##   these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 10 patterns
## Searching for 10 patterns in peaks....
## Search for ACTGCC
## Search for AGTATT
## Search for ATACTC
## Search for CATTCC
## Search for CCGTCC
## Search for CTCTCT
## Search for GATCTC
## Search for GCATTT
## Search for GTGTTC
## Search for TGTGAT
## ....finished search
res_Ago1_known_ctrl <- res_Ago1_known_ctrl[[1]]
res_Ago1_known_ctrl <- rbindlist(res_Ago1_known_ctrl)
res_Ago1_known_ctrl$group <- paste0(res_Ago1_known_ctrl$SeqExtended, "_", res_Ago1_known_ctrl$siteOfPattern)
length(unique(res_Ago1_known_ctrl$group)) #62644
## [1] 61733
res_Ago1_known_ctrl$startOfPmatch <- as.numeric(res_Ago1_known_ctrl$startOfPmatch)
res_Ago1_known_ctrl$filt <- duplicated(res_Ago1_known_ctrl$group)
res_Ago1_known_ctrl <- subset(res_Ago1_known_ctrl, res_Ago1_known_ctrl$filt == FALSE)

#get GRs with the same peak subset for calculating specific coverage
Ago1_aegypti_ranges <- subset(res_Ago1_known, res_Ago1_known$aegypti_Ago1_BCsample > 0 & res_Ago1_known$aegypti_Ago2_BCsample ==0 )
Ago1_aegypti_ctrl_ranges <- subset(res_Ago1_known_ctrl, res_Ago1_known_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_known_ctrl$aegypti_Ago2_BCsample == 0)

#subset matched patterns in peaks with support from aegypti Ago1 reads
Ago1_aegypti_seed_known_ctrl <- subset(res_Ago1_known_ctrl$startOfPmatch, res_Ago1_known_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_known_ctrl$aegypti_Ago2_BCsample == 0)
Ago1_aegypti_seed_known <- subset(res_Ago1_known$startOfPmatch, res_Ago1_known$aegypti_Ago1_BCsample > 0 & res_Ago1_known$aegypti_Ago2_BCsample ==0 )

Search top 10 most abundant known miRNA family 6mers in cells

“top10_aegypti_Ago1_known_fam_rn.fa” file with 6mer target sequences of the most abundant miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github

#all peak ranges
res <- annotatePeaksWithPatterns(peaks=allGR,
                                  fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
                                patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/top10_cells_Ago1_known_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
##   sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
##   NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
##   NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
##   NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
##   NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
##   NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
##   NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
##   NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
##   NIGP01002255, and NIGP01002260. Note that ranges located on a
##   sequence whose length is unknown (NA) or on a circular sequence
##   are not considered out-of-bound (use seqlengths() and isCircular()
##   to get the lengths and circularity flags of the underlying
##   sequences). You can use trim() to trim these ranges. See
##   ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 10 patterns
## Searching for 10 patterns in peaks....
## Search for ATACTC
## Search for CCGTCC
## Search for CTAGTC
## Search for GAGATT
## Search for GATATT
## Search for GATCTC
## Search for GTACAA
## Search for GTGTTC
## Search for TACCTG
## Search for TGTGAT
## ....finished search
res <- res[[1]]
res <- rbindlist(res)
res$group <- paste0(res$SeqExtended, "_", res$siteOfPattern)
res$startOfPmatch <- as.numeric(res$startOfPmatch)
res$filt <- duplicated(res$group)
res_Ago1_known <- subset(res, res$filt == FALSE)

#control ranges
res_Ago1_known_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
                              fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/top10_cells_Ago1_known_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
##   NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
##   NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
##   NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
##   NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
##   NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
##   NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
##   NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
##   NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
##   NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
##   NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
##   NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
##   NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
##   NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
##   NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
##   NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
##   NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
##   NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
##   NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
##   NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
##   NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
##   NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
##   NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
##   NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
##   NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
##   NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
##   NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
##   NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
##   NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
##   NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
##   NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
##   NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
##   NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
##   NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
##   NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
##   NIGP01002295. Note that ranges located on a sequence whose length
##   is unknown (NA) or on a circular sequence are not considered
##   out-of-bound (use seqlengths() and isCircular() to get the lengths
##   and circularity flags of the underlying sequences). You can use
##   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
##   more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
##   NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
##   NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
##   NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
##   NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
##   NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
##   NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
##   NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
##   NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
##   NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
##   NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
##   NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
##   NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
##   NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
##   NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
##   NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
##   NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
##   NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
##   NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
##   NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
##   NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
##   NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
##   NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
##   NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
##   NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
##   NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
##   NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
##   NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
##   NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
##   NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
##   NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
##   NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
##   NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
##   NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
##   NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
##   NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
##   NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
##   located on a sequence whose length is unknown (NA) or on a
##   circular sequence are not considered out-of-bound (use
##   seqlengths() and isCircular() to get the lengths and circularity
##   flags of the underlying sequences). You can use trim() to trim
##   these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 10 patterns
## Searching for 10 patterns in peaks....
## Search for ATACTC
## Search for CCGTCC
## Search for CTAGTC
## Search for GAGATT
## Search for GATATT
## Search for GATCTC
## Search for GTACAA
## Search for GTGTTC
## Search for TACCTG
## Search for TGTGAT
## ....finished search
res_Ago1_known_ctrl <- res_Ago1_known_ctrl[[1]]
res_Ago1_known_ctrl <- rbindlist(res_Ago1_known_ctrl)
res_Ago1_known_ctrl$group <- paste0(res_Ago1_known_ctrl$SeqExtended, "_", res_Ago1_known_ctrl$siteOfPattern)
res_Ago1_known_ctrl$startOfPmatch <- as.numeric(res_Ago1_known_ctrl$startOfPmatch)
res_Ago1_known_ctrl$filt <- duplicated(res_Ago1_known_ctrl$group)
res_Ago1_known_ctrl <- subset(res_Ago1_known_ctrl, res_Ago1_known_ctrl$filt == FALSE)

#get GRs with the same peak subset for calculating specific coverage
Ago1_Aag2_ranges <- subset(res_Ago1_known, res_Ago1_known$Aag2_Ago1_BCsample > 0 & res_Ago1_known$Aag2_Ago2_BCsample ==0 )
Ago1_Aag2_ctrl_ranges <- subset(res_Ago1_known_ctrl, res_Ago1_known_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_known_ctrl$Aag2_Ago2_BCsample ==0 )

#subset matched patterns in peaks with support from Aag2 Ago1 reads
Ago1_Aag2_seed_known_ctrl <- subset(res_Ago1_known_ctrl$startOfPmatch, res_Ago1_known_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_known_ctrl$Aag2_Ago2_BCsample ==0)
Ago1_Aag2_seed_known <- subset(res_Ago1_known$startOfPmatch, res_Ago1_known$Aag2_Ago1_BCsample > 0 & res_Ago1_known$Aag2_Ago2_BCsample ==0)

Get coverage by position around peaks containing patterns; known miRNAs

# #aegypti Ago1 known peak ranges
# Ago1_aegyptik_GR <-  GRanges(seqnames = Ago1_aegypti_ranges$seqnames.x, strand = Ago1_aegypti_ranges$strand.x,
#                           ranges = IRanges(start =Ago1_aegypti_ranges$start.x, width =Ago1_aegypti_ranges$width.x ))
# 
# ToPlot_aegypti_Ago1kalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegyptik_GR  ,format = "bigwig")
# f <- plotRegion(ToPlot_aegypti_Ago1kalt,outliers = 0.05,groupBy = "Sample")
# aegypti_Ago1kalt <- f$data
# 
# #aegypti Ago1 known control ranges
# Ago1_aegyptik_ctrlGR <-  GRanges(seqnames = Ago1_aegypti_ctrl_ranges$seqnames.x, strand = Ago1_aegypti_ctrl_ranges$strand.x,
#                              ranges = IRanges(start =Ago1_aegypti_ctrl_ranges$start.x, width =Ago1_aegypti_ctrl_ranges$width.x ))
# 
# ToPlot_ctrl_aegypti_kAgo1 <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegyptik_ctrlGR,format = "bigwig")
# ctrl_aegypti_kAgo1_peaks  <- plotRegion(ToPlot_ctrl_aegypti_kAgo1,outliers = 0.05)
# ctrl_aegypti_kAgo1_peaks <- ctrl_aegypti_kAgo1_peaks$data
# ctrl_aegypti_kAgo1_peaks$Sample <- c("ctrl_peaks") 
# 
# aegypti_Ago1kalt  <- rbind(ctrl_aegypti_kAgo1_peaks, aegypti_Ago1kalt )
# 
# #Aag2 Ago1 known peak ranges
# Ago1_Aag2k_GR <-  GRanges(seqnames = Ago1_Aag2_ranges$seqnames.x, strand = Ago1_Aag2_ranges$strand.x,
#                           ranges = IRanges(start = Ago1_Aag2_ranges$start.x, width =Ago1_Aag2_ranges$width.x ))
# 
# ToPlot_Aag2_Ago1kalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2k_GR ,format = "bigwig")
# f <- plotRegion(ToPlot_Aag2_Ago1kalt,outliers = 0.05,groupBy = "Sample")
# Aag2_Ago1kalt <- f$data
# 
# #Aag2 Ago1 known control ranges
# Ago1_Aag2k_ctrlGR <-  GRanges(seqnames = Ago1_Aag2_ctrl_ranges$seqnames.x, strand = Ago1_Aag2_ctrl_ranges$strand.x,
#                               ranges = IRanges(start = Ago1_Aag2_ctrl_ranges$start.x, width =Ago1_Aag2_ctrl_ranges$width.x ))
# 
# ToPlot_ctrl_Aag2_Ago1kalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2k_ctrlGR ,format = "bigwig")
# ctrlAag2_Ago1kalt_peaks <- plotRegion(ToPlot_ctrl_Aag2_Ago1kalt ,outliers = 0.05)
# ctrlAag2_Ago1kalt_peaks <- ctrlAag2_Ago1kalt_peaks$data
# ctrlAag2_Ago1kalt_peaks$Sample <- c("ctrl_peaks")
# 
# Aag2_Ago1kalt <- rbind(ctrlAag2_Ago1kalt_peaks, Aag2_Ago1kalt)
# #save(Aag2_Ago1kalt, aegypti_Ago1kalt, file="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/knownmiRNAs.RData")

output file “knownmiRNAs.RData” is available on my Github

Combine coverage with known miRNA pattern matches by positions and graph; Figures 5A and S4A

load("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/knownmiRNAs.RData")
##now combine coverage with pattern matches by position and graph
nf <- c("Aag2_Ago1kalt", "aegypti_Ago1kalt")
norm_f <- do.call("list",mget(nf))
cov <- lapply(norm_f, function(x) subset(x, x$xIndex >= 1400 & x$xIndex <=1600))
covdf <- rbindlist(cov, idcol = TRUE)
covdf$xIndex <- covdf$xIndex - 1500
col.names <- colnames(covdf)
col.names <- gsub(".id", "ranges", col.names)
col.names <- gsub("xIndex", "position", col.names)
covdf <- setNames(covdf, col.names)
cells_ago1 <- subset(covdf, covdf$ranges=="Aag2_Ago1kalt")
cells_ago1 <- subset(cells_ago1, cells_ago1$Sample=="ctrl_peaks" | cells_ago1$Sample=="cat_Aag2Ago1.bw")
aegypti_ago1 <- subset(covdf, covdf$ranges=="aegypti_Ago1kalt")
aegypti_ago1 <- subset(aegypti_ago1, aegypti_ago1$Sample=="ctrl_peaks" | aegypti_ago1$Sample=="cat_aegyptiAgo1.bw")

aegypti_ago1$merge <- ifelse(aegypti_ago1$Sample=="cat_aegyptiAgo1.bw" , paste0(aegypti_ago1$position, "Ago1_aegypti_seed_known"), paste0(aegypti_ago1$position, "Ago1_aegypti_seed_known_ctrl"))

cells_ago1$merge <- ifelse(cells_ago1$Sample=="cat_Aag2Ago1.bw" , paste0(cells_ago1$position, "Ago1_Aag2_seed_known"), paste0(cells_ago1$position, "Ago1_Aag2_seed_known_ctrl"))

#get lists of pattern seed matches and then count all together for graphing 
Ago1known <- grep("_seed_known", names(.GlobalEnv),value=TRUE)
Ago1known <- grep("Ago1_", Ago1known, value=TRUE)
Ago1known <- do.call("list",mget(Ago1known))
Ago1known <- lapply(Ago1known, as.data.frame)
Ago1known <- lapply(Ago1known, setNames, nm="position")
Ago1known_counts <- lapply(Ago1known, function(x) x %>% group_by(position) %>% summarize(count=n()))
Ago1known_counts<- lapply(Ago1known_counts, function(x) subset(x, x$position <= 300 & x$position >=100))

#make master df by sample name
Ago1df <- rbindlist(Ago1known_counts, idcol = TRUE)
Ago1df$position <- Ago1df$position - 200
Aag2Ago1k <- subset(Ago1df, Ago1df$.id=="Ago1_Aag2_seed_known_ctrl" | Ago1df$.id=="Ago1_Aag2_seed_known")
Aag2Ago1k$merge <- paste0(Aag2Ago1k$position, Aag2Ago1k$.id)

aegyptiAgo1k <- subset(Ago1df, Ago1df$.id=="Ago1_aegypti_seed_known_ctrl" | Ago1df$.id=="Ago1_aegypti_seed_known")
aegyptiAgo1k$merge <- paste0(aegyptiAgo1k$position, aegyptiAgo1k$.id)

#merge coverage with pattern match counts by position
aegypti_ago1 <- merge(aegyptiAgo1k, aegypti_ago1, by="merge")
cells_ago1 <- merge(Aag2Ago1k, cells_ago1, by="merge")

aegypti_ago1 <- aegypti_ago1 %>%
  group_by(Sample) %>%
  mutate( counts_norm = (count*Score))

aegypti_ago1 <- aegypti_ago1 %>%
  group_by(Sample) %>%
  mutate(freq_norm = counts_norm/(sum(counts_norm)))

cells_ago1 <- cells_ago1 %>%
  group_by(Sample) %>%
  mutate( counts_norm = (count*Score))

cells_ago1 <-cells_ago1 %>%
  group_by(Sample) %>%
  mutate(freq_norm = counts_norm/(sum(counts_norm)))

#smooth with spline function
aegypti_ago1k  <-split(aegypti_ago1, aegypti_ago1$.id)
aegypti_ago1k  <- lapply(aegypti_ago1k, function(x) x[order(x$position.x),])
aegypti_ago1k_spline <-lapply(aegypti_ago1k, function(x) spline(x$freq_norm))
aegypti_ago1k_spline <- rbindlist(aegypti_ago1k_spline, idcol = TRUE)
aegypti_ago1k_spline$x <- aegypti_ago1k_spline$x - 101
col.names <- colnames(aegypti_ago1k_spline)
col.names <- gsub(".id", "sample", col.names)
aegypti_ago1k_spline <- setNames(aegypti_ago1k_spline, col.names)

ggplot(aegypti_ago1k_spline , aes(x = x, y = y)) + 
  geom_line(aes(color = sample)) + 
  scale_color_manual(values = c("#3360A9", "#999999"))  + theme_bw()

cells_ago1k  <-split(cells_ago1, cells_ago1$.id)
cells_ago1k  <- lapply(cells_ago1k, function(x) x[order(x$position.x),])
cells_ago1k_spline <-lapply(cells_ago1k, function(x) spline(x$freq_norm))
cells_ago1k_spline <- rbindlist(cells_ago1k_spline, idcol = TRUE)
cells_ago1k_spline$x <- cells_ago1k_spline $x - 101
col.names <- colnames(cells_ago1k_spline)
col.names <- gsub(".id", "sample", col.names)
cells_ago1k_spline <- setNames(cells_ago1k_spline, col.names)

ggplot(cells_ago1k_spline , aes(x = x, y = y)) + 
  geom_line(aes(color = sample)) + 
  scale_color_manual(values = c("#1B0B80", "#999999"))  + theme_bw()

Search top most abundant/targeting novel miRNA family 6mers (13 families) in mosquitoes

“aegypti_Ago1_novel_fam_rn.fa” file with 6mer target sequences of the most abundant miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github

#all peak ranges
res_Ago1_aegypti_novel <- annotatePeaksWithPatterns(peaks=allGR,
                                  fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
                                patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/aegypti_Ago1_novel_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
##   sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
##   NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
##   NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
##   NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
##   NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
##   NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
##   NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
##   NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
##   NIGP01002255, and NIGP01002260. Note that ranges located on a
##   sequence whose length is unknown (NA) or on a circular sequence
##   are not considered out-of-bound (use seqlengths() and isCircular()
##   to get the lengths and circularity flags of the underlying
##   sequences). You can use trim() to trim these ranges. See
##   ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 13 patterns
## Searching for 13 patterns in peaks....
## Search for ATGGTA
## Search for GACCTA
## Search for CGGAAT
## Search for TGGTCA
## Search for ACACCA
## Search for CCTCAC
## Search for CGGCCT
## Search for GGCCTA
## Search for TACGTA
## Search for CTACCT
## Search for ATGTCG
## Search for TATGGG
## Search for TCACTA
## ....finished search
res_Ago1_aegypti_novel <- res_Ago1_aegypti_novel[[1]]
res_Ago1_aegypti_novel <- rbindlist(res_Ago1_aegypti_novel)
res_Ago1_aegypti_novel$group <- paste0(res_Ago1_aegypti_novel$SeqExtended, "_", res_Ago1_aegypti_novel$siteOfPattern)
res_Ago1_aegypti_novel$startOfPmatch <- as.numeric(res_Ago1_aegypti_novel$startOfPmatch)
res_Ago1_aegypti_novel$filt <- duplicated(res_Ago1_aegypti_novel$group)
res_Ago1_aegypti_novel <- subset(res_Ago1_aegypti_novel, res_Ago1_aegypti_novel$filt == FALSE)

#control ranges
res_Ago1_aegypti_novel_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
                              fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/aegypti_Ago1_novel_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
##   NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
##   NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
##   NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
##   NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
##   NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
##   NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
##   NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
##   NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
##   NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
##   NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
##   NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
##   NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
##   NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
##   NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
##   NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
##   NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
##   NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
##   NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
##   NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
##   NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
##   NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
##   NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
##   NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
##   NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
##   NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
##   NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
##   NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
##   NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
##   NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
##   NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
##   NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
##   NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
##   NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
##   NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
##   NIGP01002295. Note that ranges located on a sequence whose length
##   is unknown (NA) or on a circular sequence are not considered
##   out-of-bound (use seqlengths() and isCircular() to get the lengths
##   and circularity flags of the underlying sequences). You can use
##   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
##   more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
##   NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
##   NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
##   NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
##   NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
##   NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
##   NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
##   NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
##   NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
##   NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
##   NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
##   NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
##   NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
##   NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
##   NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
##   NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
##   NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
##   NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
##   NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
##   NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
##   NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
##   NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
##   NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
##   NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
##   NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
##   NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
##   NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
##   NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
##   NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
##   NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
##   NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
##   NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
##   NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
##   NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
##   NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
##   NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
##   NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
##   located on a sequence whose length is unknown (NA) or on a
##   circular sequence are not considered out-of-bound (use
##   seqlengths() and isCircular() to get the lengths and circularity
##   flags of the underlying sequences). You can use trim() to trim
##   these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 13 patterns
## Searching for 13 patterns in peaks....
## Search for ATGGTA
## Search for GACCTA
## Search for CGGAAT
## Search for TGGTCA
## Search for ACACCA
## Search for CCTCAC
## Search for CGGCCT
## Search for GGCCTA
## Search for TACGTA
## Search for CTACCT
## Search for ATGTCG
## Search for TATGGG
## Search for TCACTA
## ....finished search
res_Ago1_aegypti_novel_ctrl <- res_Ago1_aegypti_novel_ctrl[[1]]
res_Ago1_aegypti_novel_ctrl <- rbindlist(res_Ago1_aegypti_novel_ctrl)
res_Ago1_aegypti_novel_ctrl$group <- paste0(res_Ago1_aegypti_novel_ctrl$SeqExtended, "_", res_Ago1_aegypti_novel_ctrl$siteOfPattern)
res_Ago1_aegypti_novel_ctrl$startOfPmatch <- as.numeric(res_Ago1_aegypti_novel_ctrl$startOfPmatch)
res_Ago1_aegypti_novel_ctrl$filt <- duplicated(res_Ago1_aegypti_novel_ctrl$group)
res_Ago1_aegypti_novel_ctrl <- subset(res_Ago1_aegypti_novel_ctrl, res_Ago1_aegypti_novel_ctrl$filt == FALSE)

#get GRs with the same peak subset for calculating specific coverage
Ago1_aegypti_nranges <- subset(res_Ago1_aegypti_novel, res_Ago1_aegypti_novel$aegypti_Ago1_BCsample > 0 & res_Ago1_aegypti_novel$aegypti_Ago2_BCsample ==0)
Ago1_aegypti_ctrl_nranges <- subset(res_Ago1_aegypti_novel_ctrl, res_Ago1_aegypti_novel_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_aegypti_novel_ctrl$aegypti_Ago2_BCsample ==0)

#subset matched patterns in peaks with support from aegypti Ago1 reads
Ago1_aegypti_seed_novel_ctrl <- subset(res_Ago1_aegypti_novel_ctrl$startOfPmatch, res_Ago1_aegypti_novel_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_aegypti_novel_ctrl$aegypti_Ago2_BCsample ==0)
Ago1_aegypti_seed_novel <- subset(res_Ago1_aegypti_novel$startOfPmatch, res_Ago1_aegypti_novel$aegypti_Ago1_BCsample > 0 & res_Ago1_aegypti_novel$aegypti_Ago2_BCsample ==0)

Search top most abundant/targeting novel miRNA family 6mers (15 families) in Aag2 cells

“cells_Ago1_novel_fam_rn.fa” file with 6mer target sequences of the most abundant miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github

#all peak ranges
res_Ago1_cells_novel <- annotatePeaksWithPatterns(peaks=allGR,
                              fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/cells_Ago1_novel_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
##   sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
##   NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
##   NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
##   NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
##   NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
##   NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
##   NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
##   NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
##   NIGP01002255, and NIGP01002260. Note that ranges located on a
##   sequence whose length is unknown (NA) or on a circular sequence
##   are not considered out-of-bound (use seqlengths() and isCircular()
##   to get the lengths and circularity flags of the underlying
##   sequences). You can use trim() to trim these ranges. See
##   ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 15 patterns
## Searching for 15 patterns in peaks....
## Search for AAATCC
## Search for ATGGTA
## Search for GTAGGA
## Search for GTGGGA
## Search for ATGGAC
## Search for CCAGGA
## Search for GACCTA
## Search for CCAATC
## Search for CGGAAT
## Search for TGGTCA
## Search for AATCCA
## Search for ATAAAT
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 4 out-of-bound ranges located on sequence
##   NIGP01000607. Note that ranges located on a sequence whose length
##   is unknown (NA) or on a circular sequence are not considered
##   out-of-bound (use seqlengths() and isCircular() to get the lengths
##   and circularity flags of the underlying sequences). You can use
##   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
##   more information.
## Search for TGCCGT
## Search for CTACCT
## Search for TATGGG
## ....finished search
res_Ago1_cells_novel <- res_Ago1_cells_novel[[1]]
res_Ago1_cells_novel <- rbindlist(res_Ago1_cells_novel)
res_Ago1_cells_novel$group <- paste0(res_Ago1_cells_novel$SeqExtended, "_", res_Ago1_cells_novel$siteOfPattern)
res_Ago1_cells_novel$startOfPmatch <- as.numeric(res_Ago1_cells_novel$startOfPmatch)
res_Ago1_cells_novel$filt <- duplicated(res_Ago1_cells_novel$group)
res_Ago1_cells_novel <- subset(res_Ago1_cells_novel, res_Ago1_cells_novel$filt == FALSE)

#control ranges
res_Ago1_cells_novel_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
                              fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/cells_Ago1_novel_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
##   NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
##   NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
##   NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
##   NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
##   NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
##   NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
##   NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
##   NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
##   NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
##   NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
##   NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
##   NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
##   NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
##   NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
##   NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
##   NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
##   NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
##   NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
##   NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
##   NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
##   NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
##   NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
##   NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
##   NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
##   NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
##   NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
##   NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
##   NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
##   NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
##   NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
##   NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
##   NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
##   NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
##   NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
##   NIGP01002295. Note that ranges located on a sequence whose length
##   is unknown (NA) or on a circular sequence are not considered
##   out-of-bound (use seqlengths() and isCircular() to get the lengths
##   and circularity flags of the underlying sequences). You can use
##   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
##   more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
##   NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
##   NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
##   NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
##   NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
##   NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
##   NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
##   NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
##   NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
##   NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
##   NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
##   NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
##   NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
##   NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
##   NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
##   NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
##   NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
##   NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
##   NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
##   NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
##   NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
##   NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
##   NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
##   NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
##   NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
##   NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
##   NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
##   NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
##   NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
##   NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
##   NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
##   NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
##   NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
##   NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
##   NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
##   NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
##   NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
##   located on a sequence whose length is unknown (NA) or on a
##   circular sequence are not considered out-of-bound (use
##   seqlengths() and isCircular() to get the lengths and circularity
##   flags of the underlying sequences). You can use trim() to trim
##   these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 15 patterns
## Searching for 15 patterns in peaks....
## Search for AAATCC
## Search for ATGGTA
## Search for GTAGGA
## Search for GTGGGA
## Search for ATGGAC
## Search for CCAGGA
## Search for GACCTA
## Search for CCAATC
## Search for CGGAAT
## Search for TGGTCA
## Search for AATCCA
## Search for ATAAAT
## Search for TGCCGT
## Search for CTACCT
## Search for TATGGG
## ....finished search
res_Ago1_cells_novel_ctrl <- res_Ago1_cells_novel_ctrl[[1]]
res_Ago1_cells_novel_ctrl <- rbindlist(res_Ago1_cells_novel_ctrl)
res_Ago1_cells_novel_ctrl$group <- paste0(res_Ago1_cells_novel_ctrl$SeqExtended, "_", res_Ago1_cells_novel_ctrl$siteOfPattern)
res_Ago1_cells_novel_ctrl$startOfPmatch <- as.numeric(res_Ago1_cells_novel_ctrl$startOfPmatch)
res_Ago1_cells_novel_ctrl$filt <- duplicated(res_Ago1_cells_novel_ctrl$group)
res_Ago1_cells_novel_ctrl <- subset(res_Ago1_cells_novel_ctrl, res_Ago1_cells_novel_ctrl$filt == FALSE)


#get GRs with the same peak subset for calculating specific coverage
Ago1_Aag2_nranges <- subset(res_Ago1_cells_novel, res_Ago1_cells_novel$Aag2_Ago1_BCsample > 0 & res_Ago1_cells_novel$Aag2_Ago2_BCsample == 0 )
Ago1_Aag2_ctrl_nranges <- subset(res_Ago1_cells_novel_ctrl, res_Ago1_cells_novel_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_cells_novel_ctrl$Aag2_Ago2_BCsample == 0 )

#subset matched patterns in peaks with support from Aag2 Ago1 reads
Ago1_Aag2_seed_novel_ctrl <- subset(res_Ago1_cells_novel_ctrl$startOfPmatch, res_Ago1_cells_novel_ctrl$Aag2_Ago1_BCsample> 0 & res_Ago1_cells_novel_ctrl$Aag2_Ago2_BCsample ==0)
Ago1_Aag2_seed_novel <- subset(res_Ago1_cells_novel$startOfPmatch, res_Ago1_cells_novel$Aag2_Ago1_BCsample > 0 & res_Ago1_cells_novel$Aag2_Ago2_BCsample == 0 )

Get coverage by position around peaks containing patterns; novel miRNAs

# #aegypti Ago1 novel peak ranges 
# Ago1_aegyptin_GR <-  GRanges(seqnames = Ago1_aegypti_nranges$seqnames.x, strand = Ago1_aegypti_nranges$strand.x, ranges = IRanges(start = Ago1_aegypti_nranges$start.x, width =Ago1_aegypti_nranges$width.x ))
# 
# toPlot_aegypti_Ago1nalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegyptin_GR ,format = "bigwig")
# f <- plotRegion(toPlot_aegypti_Ago1nalt,outliers = 0.05,groupBy = "Sample")
# aegypti_Ago1nalt <- f$data
# 
# #aegypti Ago1 novel control ranges 
# Ago1_aegyptin_ctrlGR <-  GRanges(seqnames = Ago1_aegypti_ctrl_nranges$seqnames.x, strand = Ago1_aegypti_ctrl_nranges$strand.x,ranges = IRanges(start = Ago1_aegypti_ctrl_nranges$start.x, width =Ago1_aegypti_ctrl_nranges$width.x ))
# 
# ToPlot_ctrl_aegypti_Ago1nalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegyptin_ctrlGR,format = "bigwig")
# ctrlaegypti_Ago1nalt_peaks <- plotRegion(ToPlot_ctrl_aegypti_Ago1nalt ,outliers = 0.05)
# ctrlaegypti_Ago1nalt_peaks <- ctrlaegypti_Ago1nalt_peaks$data
# ctrlaegypti_Ago1nalt_peaks$Sample <- c("ctrl_peaks")
# 
# aegypti_Ago1nalt <- rbind(ctrlaegypti_Ago1nalt_peaks, aegypti_Ago1nalt)
# 
# #Aag2 Ago1 novel peak ranges 
# Ago1_Aag2n_GR <-  GRanges(seqnames = Ago1_Aag2_nranges$seqnames.x, strand = Ago1_Aag2_nranges$strand.x,
# ranges = IRanges(start = Ago1_Aag2_nranges$start.x, width =Ago1_Aag2_nranges$width.x ))
# 
# ToPlot_Aag2_Ago1nalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2n_GR ,format = "bigwig")
# f <- plotRegion(ToPlot_Aag2_Ago1nalt,outliers = 0.05,groupBy = "Sample")
# Aag2_Ago1nalt <- f$data
# 
# #Aag2 Ago1 novel control ranges 
# 
# Ago1_Aag2n_ctrlGR <-  GRanges(seqnames = Ago1_Aag2_ctrl_nranges$seqnames.x, strand = Ago1_Aag2_ctrl_nranges$strand.x,ranges = IRanges(start = Ago1_Aag2_ctrl_nranges$start.x, width =Ago1_Aag2_ctrl_nranges$width.x ))
# 
# ToPlot_ctrl_Aag2_Ago1nalt <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2n_ctrlGR ,format = "bigwig")
# ctrlAag2_Ago1nalt_peaks <- plotRegion(ToPlot_ctrl_Aag2_Ago1nalt ,outliers = 0.05)
# ctrlAag2_Ago1nalt_peaks <- ctrlAag2_Ago1nalt_peaks$data
# ctrlAag2_Ago1nalt_peaks$Sample <- c("ctrl_peaks")
# 
# Aag2_Ago1nalt <- rbind(ctrlAag2_Ago1nalt_peaks, Aag2_Ago1nalt)
# 
# #save(Aag2_Ago1nalt, aegypti_Ago1nalt, file="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/novelmiRNAs.RData")

output file “novelmiRNAs.RData” is available on my Github

Combine coverage with novel miRNA pattern matches by positions and graph; Figures 5B and S4B

load("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/novelmiRNAs.RData")
nf <- c("Aag2_Ago1nalt", "aegypti_Ago1nalt")
norm_f <- do.call("list",mget(nf))
cov <- lapply(norm_f, function(x) subset(x, x$xIndex >= 1400 & x$xIndex <=1600))
covdf <- rbindlist(cov, idcol = TRUE)
covdf$xIndex <- covdf$xIndex - 1500
col.names <- colnames(covdf)
col.names <- gsub(".id", "ranges", col.names)
col.names <- gsub("xIndex", "position", col.names)
covdf <- setNames(covdf, col.names)

cells_ago1 <- subset(covdf, covdf$ranges=="Aag2_Ago1nalt")
aegypti_ago1 <- subset(covdf, covdf$ranges=="aegypti_Ago1nalt")

cells_ago1$merge <- ifelse(cells_ago1$Sample=="cat_Aag2Ago1.bw" , paste0(cells_ago1$position, "Ago1_Aag2_seed_novel"), paste0(cells_ago1$position, "Ago1_Aag2_seed_novel_ctrl"))

aegypti_ago1$merge <- ifelse(aegypti_ago1$Sample=="cat_aegyptiAgo1.bw" , paste0(aegypti_ago1$position, "Ago1_aegypti_seed_novel"), paste0(aegypti_ago1$position, "Ago1_aegypti_seed_novel_ctrl"))

#make list of pat matches and then count all together for graphing 
Ago1novel <- grep("_seed_novel", names(.GlobalEnv),value=TRUE)
Ago1novel <- grep("Ago1_", Ago1novel, value=TRUE)
Ago1novel <- do.call("list",mget(Ago1novel))
Ago1novel <- lapply(Ago1novel, as.data.frame)
Ago1novel <- lapply(Ago1novel, setNames, nm="position")
Ago1novel_counts <- lapply(Ago1novel, function(x) x %>% group_by(position) %>% summarize(count=n()))
Ago1novel_counts <- lapply(Ago1novel_counts, function(x) subset(x, x$position <= 300 & x$position >=100))

#make master df by sample name
Ago1noveldf <- rbindlist(Ago1novel_counts, idcol = TRUE)
Ago1noveldf$position <- Ago1noveldf$position - 200
Aag2Ago1n <- subset(Ago1noveldf, Ago1noveldf$.id=="Ago1_Aag2_seed_novel_ctrl" | Ago1noveldf$.id=="Ago1_Aag2_seed_novel")
Aag2Ago1n$merge <- paste0(Aag2Ago1n$position,Aag2Ago1n$.id)

aegyptiAgo1n <- subset(Ago1noveldf, Ago1noveldf$.id=="Ago1_aegypti_seed_novel_ctrl" | Ago1noveldf$.id=="Ago1_aegypti_seed_novel")
aegyptiAgo1n$merge <- paste0(aegyptiAgo1n$position, aegyptiAgo1n$.id)

aegypti_ago1 <- merge(aegyptiAgo1n, aegypti_ago1, by="merge")
cells_ago1 <- merge(Aag2Ago1n, cells_ago1, by="merge")

aegypti_ago1 <- aegypti_ago1 %>%
  group_by(Sample) %>%
  mutate( counts_norm = (count*Score))

aegypti_ago1 <- aegypti_ago1 %>%
  group_by(Sample) %>%
  mutate(freq_norm = counts_norm/(sum(counts_norm)))

aegypti_ago1n  <-split(aegypti_ago1, aegypti_ago1$.id)
aegypti_ago1n  <- lapply(aegypti_ago1n, function(x) x[order(x$position.x),])
aegypti_ago1n_spline <-lapply(aegypti_ago1n, function(x) spline(x$freq_norm))
aegypti_ago1n_spline <- rbindlist(aegypti_ago1n_spline, idcol = TRUE)
aegypti_ago1n_spline$x <- aegypti_ago1n_spline$x - 101
col.names <- colnames(aegypti_ago1n_spline)
col.names <- gsub(".id", "sample", col.names)
aegypti_ago1n_spline <- setNames(aegypti_ago1n_spline, col.names)

ggplot(aegypti_ago1n_spline , aes(x = x, y = y)) + 
  geom_line(aes(color = sample)) + 
  scale_color_manual(values = c("#3360A9", "#999999")) + theme_bw() 

cells_ago1 <- cells_ago1 %>%
  group_by(Sample) %>%
  mutate( counts_norm = (count*Score))

cells_ago1 <-cells_ago1 %>%
  group_by(Sample) %>%
  mutate(freq_norm = counts_norm/(sum(counts_norm)))

cells_ago1n <-split(cells_ago1, cells_ago1$.id)
cells_ago1n <- lapply(cells_ago1n, function(x) x[order(x$position.x),])
cells_ago1n_spline <-lapply(cells_ago1n, function(x) spline(x$freq_norm))
cells_ago1n_spline <- rbindlist(cells_ago1n_spline, idcol = TRUE)
cells_ago1n_spline$x <- cells_ago1n_spline $x - 101
col.names <- colnames(cells_ago1n_spline)
col.names <- gsub(".id", "sample", col.names)
cells_ago1n_spline <- setNames(cells_ago1n_spline, col.names)

ggplot(cells_ago1n_spline , aes(x = x, y = y)) + 
  geom_line(aes(color = sample)) + 
  scale_color_manual(values = c("#1B0B80", "#999999")) + theme_bw()

Search for known miRNA families have similar abundance to top novel miRNAs in mosquitoes

“known_miRNAs_novelab_mos_fam_rn.fa” file with 6mer target sequences of known miRNAs of similar abundance as the most abundant novel miRNAs in mosquitoes was generated in smallRNA_abundance_scatterplots script and is available in my Github

 #all peak ranges
res_Ago1_av_known <- annotatePeaksWithPatterns(peaks=allGR,
                                  fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
                                patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_miRNAs_novelab_mos_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
##   sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
##   NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
##   NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
##   NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
##   NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
##   NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
##   NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
##   NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
##   NIGP01002255, and NIGP01002260. Note that ranges located on a
##   sequence whose length is unknown (NA) or on a circular sequence
##   are not considered out-of-bound (use seqlengths() and isCircular()
##   to get the lengths and circularity flags of the underlying
##   sequences). You can use trim() to trim these ranges. See
##   ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 6 patterns
## Searching for 6 patterns in peaks....
## Search for GATATT
## Search for GCACAA
## Search for TAAACC
## Search for TACCTC
## Search for TCAGGG
## Search for TGCCTT
## ....finished search
res_Ago1_av_known <- res_Ago1_av_known[[1]]
res_Ago1_av_known <- rbindlist(res_Ago1_av_known)
res_Ago1_av_known$group <- paste0(res_Ago1_av_known$SeqExtended, "_", res_Ago1_av_known$siteOfPattern)
res_Ago1_av_known$startOfPmatch <- as.numeric(res_Ago1_av_known$startOfPmatch)
res_Ago1_av_known$filt <- duplicated(res_Ago1_av_known$group)
res_Ago1_av_known <- subset(res_Ago1_av_known, res_Ago1_av_known$filt == FALSE)

#control ranges
res_Ago1_av_known_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
                              fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_miRNAs_novelab_mos_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
##   NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
##   NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
##   NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
##   NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
##   NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
##   NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
##   NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
##   NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
##   NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
##   NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
##   NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
##   NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
##   NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
##   NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
##   NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
##   NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
##   NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
##   NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
##   NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
##   NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
##   NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
##   NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
##   NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
##   NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
##   NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
##   NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
##   NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
##   NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
##   NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
##   NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
##   NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
##   NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
##   NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
##   NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
##   NIGP01002295. Note that ranges located on a sequence whose length
##   is unknown (NA) or on a circular sequence are not considered
##   out-of-bound (use seqlengths() and isCircular() to get the lengths
##   and circularity flags of the underlying sequences). You can use
##   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
##   more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
##   NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
##   NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
##   NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
##   NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
##   NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
##   NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
##   NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
##   NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
##   NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
##   NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
##   NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
##   NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
##   NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
##   NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
##   NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
##   NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
##   NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
##   NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
##   NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
##   NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
##   NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
##   NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
##   NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
##   NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
##   NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
##   NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
##   NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
##   NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
##   NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
##   NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
##   NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
##   NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
##   NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
##   NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
##   NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
##   NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
##   located on a sequence whose length is unknown (NA) or on a
##   circular sequence are not considered out-of-bound (use
##   seqlengths() and isCircular() to get the lengths and circularity
##   flags of the underlying sequences). You can use trim() to trim
##   these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 6 patterns
## Searching for 6 patterns in peaks....
## Search for GATATT
## Search for GCACAA
## Search for TAAACC
## Search for TACCTC
## Search for TCAGGG
## Search for TGCCTT
## ....finished search
res_Ago1_av_known_ctrl <- res_Ago1_av_known_ctrl[[1]]
res_Ago1_av_known_ctrl <- rbindlist(res_Ago1_av_known_ctrl)
res_Ago1_av_known_ctrl$group <- paste0(res_Ago1_av_known_ctrl$SeqExtended, "_", res_Ago1_av_known_ctrl$siteOfPattern)
res_Ago1_av_known_ctrl$startOfPmatch <- as.numeric(res_Ago1_av_known_ctrl$startOfPmatch)
res_Ago1_av_known_ctrl$filt <- duplicated(res_Ago1_av_known_ctrl$group)
res_Ago1_av_known_ctrl <- subset(res_Ago1_av_known_ctrl, res_Ago1_av_known_ctrl$filt == FALSE)

#subset matched patterns in peaks with support from aegypti Ago1 reads 
res_Ago1_av_aegypti_known_ctrl <- subset(res_Ago1_av_known_ctrl$startOfPmatch, res_Ago1_av_known_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_av_known_ctrl$aegypti_Ago2_BCsample == 0)
res_Ago1_av_aegypti_known <- subset(res_Ago1_av_known$startOfPmatch, res_Ago1_av_known$aegypti_Ago1_BCsample > 0 & res_Ago1_av_known$aegypti_Ago2_BCsample == 0)

#get GRs with the same peak subset for calculating specific coverage
res_Ago1_av_aegypti_known_ctrl_pats <- subset(res_Ago1_av_known_ctrl, res_Ago1_av_known_ctrl$aegypti_Ago1_BCsample > 0 & res_Ago1_av_known_ctrl$aegypti_Ago2_BCsample == 0)
res_Ago1_av_aegypti_known_pats <- subset(res_Ago1_av_known, res_Ago1_av_known$aegypti_Ago1_BCsample > 0 & res_Ago1_av_known$aegypti_Ago2_BCsample == 0)

Search for known miRNA families have similar abundance to top novel miRNAs in Aag2 cells

“known_miRNAs_novelab_cells_fam_rn.fa” file with 6mer target sequences of known miRNAs of similar abundance as the most abundant novel miRNAs in cells was generated in smallRNA_abundance_scatterplots script and is available in my Github

#all peak ranges
res_Ago1_av_known <- annotatePeaksWithPatterns(peaks=allGR,
                                  fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
                                patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_miRNAs_novelab_cells_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
##   sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
##   NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
##   NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
##   NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
##   NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
##   NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
##   NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
##   NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
##   NIGP01002255, and NIGP01002260. Note that ranges located on a
##   sequence whose length is unknown (NA) or on a circular sequence
##   are not considered out-of-bound (use seqlengths() and isCircular()
##   to get the lengths and circularity flags of the underlying
##   sequences). You can use trim() to trim these ranges. See
##   ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 4 patterns
## Searching for 4 patterns in peaks....
## Search for ACGCTT
## Search for AGTATT
## Search for AGTGAG
## Search for TGGTAA
## ....finished search
res_Ago1_av_known <- res_Ago1_av_known[[1]]
res_Ago1_av_known <- rbindlist(res_Ago1_av_known)
res_Ago1_av_known$group <- paste0(res_Ago1_av_known$SeqExtended, "_", res_Ago1_av_known$siteOfPattern)
res_Ago1_av_known$startOfPmatch <- as.numeric(res_Ago1_av_known$startOfPmatch)
res_Ago1_av_known$filt <- duplicated(res_Ago1_av_known$group)
res_Ago1_av_known <- subset(res_Ago1_av_known, res_Ago1_av_known$filt == FALSE)


#control ranges
res_Ago1_av_known_ctrl <- annotatePeaksWithPatterns(peaks=allctrlGR,
                              fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_miRNAs_novelab_cells_fam_rn.fa",
                                  resize=406,
                                  add5=1,
                                  add3=1,
                                  verbose=TRUE)
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 457 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000110, NIGP01000126,
##   NIGP01000140, NIGP01000150, NIGP01000152, NIGP01000158,
##   NIGP01000161, NIGP01000162, NIGP01000170, NIGP01000174,
##   NIGP01000179, NIGP01000184, NIGP01000217, NIGP01000247,
##   NIGP01000251, NIGP01000252, NIGP01000253, NIGP01000256,
##   NIGP01000258, NIGP01000309, NIGP01000313, NIGP01000315,
##   NIGP01000322, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000988, NIGP01001000,
##   NIGP01001016, NIGP01001020, NIGP01001034, NIGP01001045,
##   NIGP01001064, NIGP01001078, NIGP01001080, NIGP01001082,
##   NIGP01001101, NIGP01001106, NIGP01001132, NIGP01001139,
##   NIGP01001150, NIGP01001159, NIGP01001164, NIGP01001180,
##   NIGP01001185, NIGP01001193, NIGP01001217, NIGP01001220,
##   NIGP01001223, NIGP01001224, NIGP01001235, NIGP01001267,
##   NIGP01001275, NIGP01001280, NIGP01001306, NIGP01001308,
##   NIGP01001315, NIGP01001330, NIGP01001336, NIGP01001341,
##   NIGP01001383, NIGP01001401, NIGP01001414, NIGP01001436,
##   NIGP01001444, NIGP01001447, NIGP01001457, NIGP01001462,
##   NIGP01001463, NIGP01001507, NIGP01001511, NIGP01001526,
##   NIGP01001552, NIGP01001558, NIGP01001568, NIGP01001574,
##   NIGP01001582, NIGP01001625, NIGP01001629, NIGP01001637,
##   NIGP01001645, NIGP01001658, NIGP01001691, NIGP01001694,
##   NIGP01001703, NIGP01001710, NIGP01001711, NIGP01001740,
##   NIGP01001744, NIGP01001746, NIGP01001751, NIGP01001760,
##   NIGP01001768, NIGP01001773, NIGP01001776, NIGP01001782,
##   NIGP01001787, NIGP01001804, NIGP01001806, NIGP01001810,
##   NIGP01001814, NIGP01001829, NIGP01001831, NIGP01001836,
##   NIGP01001857, NIGP01001867, NIGP01001874, NIGP01001900,
##   NIGP01001905, NIGP01001918, NIGP01001923, NIGP01001936,
##   NIGP01001945, NIGP01001949, NIGP01001990, NIGP01002001,
##   NIGP01002026, NIGP01002028, NIGP01002052, NIGP01002064,
##   NIGP01002068, NIGP01002069, NIGP01002071, NIGP01002135,
##   NIGP01002153, NIGP01002155, NIGP01002167, NIGP01002170,
##   NIGP01002182, NIGP01002208, NIGP01002209, NIGP01002223,
##   NIGP01002227, NIGP01002234, NIGP01002244, NIGP01002252,
##   NIGP01002255, NIGP01002257, NIGP01002260, NIGP01002287, and
##   NIGP01002295. Note that ranges located on a sequence whose length
##   is unknown (NA) or on a circular sequence are not considered
##   out-of-bound (use seqlengths() and isCircular() to get the lengths
##   and circularity flags of the underlying sequences). You can use
##   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
##   more information.
## ...done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 482 out-of-bound ranges located on
##   sequences MT, NIGP01000012, NIGP01000013, NIGP01000016,
##   NIGP01000030, NIGP01000042, NIGP01000051, NIGP01000068,
##   NIGP01000082, NIGP01000098, NIGP01000106, NIGP01000110,
##   NIGP01000126, NIGP01000128, NIGP01000140, NIGP01000150,
##   NIGP01000152, NIGP01000158, NIGP01000161, NIGP01000162,
##   NIGP01000170, NIGP01000174, NIGP01000179, NIGP01000184,
##   NIGP01000217, NIGP01000234, NIGP01000247, NIGP01000251,
##   NIGP01000252, NIGP01000253, NIGP01000256, NIGP01000258,
##   NIGP01000309, NIGP01000313, NIGP01000315, NIGP01000322,
##   NIGP01000326, NIGP01000332, NIGP01000351, NIGP01000355,
##   NIGP01000358, NIGP01000374, NIGP01000397, NIGP01000413,
##   NIGP01000414, NIGP01000416, NIGP01000423, NIGP01000427,
##   NIGP01000437, NIGP01000443, NIGP01000445, NIGP01000460,
##   NIGP01000473, NIGP01000484, NIGP01000489, NIGP01000525,
##   NIGP01000532, NIGP01000550, NIGP01000582, NIGP01000584,
##   NIGP01000588, NIGP01000591, NIGP01000603, NIGP01000607,
##   NIGP01000610, NIGP01000612, NIGP01000622, NIGP01000656,
##   NIGP01000658, NIGP01000675, NIGP01000676, NIGP01000692,
##   NIGP01000703, NIGP01000706, NIGP01000714, NIGP01000720,
##   NIGP01000723, NIGP01000731, NIGP01000737, NIGP01000739,
##   NIGP01000750, NIGP01000763, NIGP01000764, NIGP01000797,
##   NIGP01000827, NIGP01000829, NIGP01000831, NIGP01000836,
##   NIGP01000842, NIGP01000866, NIGP01000870, NIGP01000880,
##   NIGP01000892, NIGP01000904, NIGP01000928, NIGP01000932,
##   NIGP01000950, NIGP01000978, NIGP01000980, NIGP01000988,
##   NIGP01001000, NIGP01001016, NIGP01001020, NIGP01001034,
##   NIGP01001045, NIGP01001064, NIGP01001078, NIGP01001080,
##   NIGP01001082, NIGP01001101, NIGP01001106, NIGP01001132,
##   NIGP01001139, NIGP01001150, NIGP01001159, NIGP01001164,
##   NIGP01001180, NIGP01001185, NIGP01001193, NIGP01001217,
##   NIGP01001220, NIGP01001223, NIGP01001224, NIGP01001235,
##   NIGP01001267, NIGP01001275, NIGP01001280, NIGP01001306,
##   NIGP01001308, NIGP01001315, NIGP01001330, NIGP01001336,
##   NIGP01001341, NIGP01001383, NIGP01001401, NIGP01001414,
##   NIGP01001416, NIGP01001436, NIGP01001444, NIGP01001447,
##   NIGP01001457, NIGP01001462, NIGP01001463, NIGP01001507,
##   NIGP01001511, NIGP01001526, NIGP01001552, NIGP01001558,
##   NIGP01001568, NIGP01001574, NIGP01001582, NIGP01001625,
##   NIGP01001629, NIGP01001637, NIGP01001645, NIGP01001658,
##   NIGP01001691, NIGP01001694, NIGP01001703, NIGP01001710,
##   NIGP01001711, NIGP01001740, NIGP01001744, NIGP01001746,
##   NIGP01001751, NIGP01001760, NIGP01001768, NIGP01001773,
##   NIGP01001776, NIGP01001782, NIGP01001787, NIGP01001804,
##   NIGP01001806, NIGP01001810, NIGP01001814, NIGP01001829,
##   NIGP01001831, NIGP01001836, NIGP01001857, NIGP01001867,
##   NIGP01001874, NIGP01001900, NIGP01001905, NIGP01001918,
##   NIGP01001923, NIGP01001936, NIGP01001945, NIGP01001949,
##   NIGP01001990, NIGP01002001, NIGP01002026, NIGP01002028,
##   NIGP01002052, NIGP01002064, NIGP01002068, NIGP01002069,
##   NIGP01002071, NIGP01002135, NIGP01002153, NIGP01002155,
##   NIGP01002167, NIGP01002170, NIGP01002182, NIGP01002208,
##   NIGP01002209, NIGP01002223, NIGP01002227, NIGP01002234,
##   NIGP01002244, NIGP01002252, NIGP01002255, NIGP01002257,
##   NIGP01002260, NIGP01002287, and NIGP01002295. Note that ranges
##   located on a sequence whose length is unknown (NA) or on a
##   circular sequence are not considered out-of-bound (use
##   seqlengths() and isCircular() to get the lengths and circularity
##   flags of the underlying sequences). You can use trim() to trim
##   these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 4 patterns
## Searching for 4 patterns in peaks....
## Search for ACGCTT
## Search for AGTATT
## Search for AGTGAG
## Search for TGGTAA
## ....finished search
res_Ago1_av_known_ctrl <- res_Ago1_av_known_ctrl[[1]]
res_Ago1_av_known_ctrl <- rbindlist(res_Ago1_av_known_ctrl)
res_Ago1_av_known_ctrl$group <- paste0(res_Ago1_av_known_ctrl$SeqExtended, "_", res_Ago1_av_known_ctrl$siteOfPattern)
res_Ago1_av_known_ctrl$startOfPmatch <- as.numeric(res_Ago1_av_known_ctrl$startOfPmatch)
res_Ago1_av_known_ctrl$filt <- duplicated(res_Ago1_av_known_ctrl$group)
res_Ago1_av_known_ctrl <- subset(res_Ago1_av_known_ctrl, res_Ago1_av_known_ctrl$filt == FALSE)

#subset matched patterns in peaks with support from Aag2 Ago1 reads 
res_Ago1_av_cells_known_ctrl <- subset(res_Ago1_av_known_ctrl$startOfPmatch, res_Ago1_av_known_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_av_known_ctrl$Aag2_Ago2_BCsample == 0)
res_Ago1_av_cells_known <- subset(res_Ago1_av_known$startOfPmatch, res_Ago1_av_known$Aag2_Ago1_BCsample > 0 & res_Ago1_av_known$Aag2_Ago2_BCsample == 0)

#get GRs with the same peak subset for calculating specific coverage
res_Ago1_av_cells_known_ctrl_pats <- subset(res_Ago1_av_known_ctrl, res_Ago1_av_known_ctrl$Aag2_Ago1_BCsample > 0 & res_Ago1_av_known_ctrl$Aag2_Ago2_BCsample == 0)
res_Ago1_av_cells_known_pats <- subset(res_Ago1_av_known, res_Ago1_av_known$Aag2_Ago1_BCsample > 0 & res_Ago1_av_known$Aag2_Ago2_BCsample == 0)

Get coverage by position around peaks containing patterns; known miRNAs with similar abundance to top novel miRNAs

# #aegypti Ago1 known (novel ab) peak ranges
# Ago1_aegypti_avk_GR <-  GRanges(seqnames = res_Ago1_av_aegypti_known_pats$seqnames.x, strand = res_Ago1_av_aegypti_known_pats$strand.x, ranges = IRanges(start = res_Ago1_av_aegypti_known_pats$start.x, width =res_Ago1_av_aegypti_known_pats$width.x ))
# 
# ToPlot_ctrl_Ago1_aegypti_avk <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegypti_avk_GR,format = "bigwig")
# aegypti_Ago1_avk <- plotRegion(ToPlot_ctrl_Ago1_aegypti_avk  ,outliers = 0.05)
# aegypti_Ago1_avk <- aegypti_Ago1_avk$data
# 
# #aegypti Ago1 known (novel ab) control ranges 
# Ago1_aegypti_avk_ctrlGR <-  GRanges(seqnames = res_Ago1_av_aegypti_known_ctrl_pats$seqnames.x, strand = res_Ago1_av_aegypti_known_ctrl_pats$strand.x,ranges = IRanges(start = res_Ago1_av_aegypti_known_ctrl_pats$start.x, width =res_Ago1_av_aegypti_known_ctrl_pats$width.x ))
# 
# toPlot_aegypti_avk_ctrl <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_aegyptiAgo1.bw",Ago1_aegypti_avk_ctrlGR ,format = "bigwig")
# aegypti_Ago1_avk_ctrl <- plotRegion(toPlot_aegypti_avk_ctrl,outliers = 0.05,groupBy = "Sample")
# aegypti_Ago1_avk_ctrl <- aegypti_Ago1_avk_ctrl$data
# aegypti_Ago1_avk_ctrl$Sample <- c("ctrl_peaks")
# 
# aegypti_Ago1_avk <- rbind(aegypti_Ago1_avk, aegypti_Ago1_avk_ctrl)
# 
# #Aag2 Ago1 known (novel ab) peak ranges
# Ago1_Aag2_avk_GR <-  GRanges(seqnames = res_Ago1_av_cells_known_pats$seqnames.x, strand = res_Ago1_av_cells_known_pats$strand.x, ranges = IRanges(start = res_Ago1_av_cells_known_pats$start.x, width =res_Ago1_av_cells_known_pats$width.x ))
# 
# ToPlot_Aag2_Ago1_avk <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2_avk_GR ,format = "bigwig")
# Aag2_Ago1_avk <- plotRegion(ToPlot_Aag2_Ago1_avk ,outliers = 0.05)
# Aag2_Ago1_avk <- Aag2_Ago1_avk$data
# 
# #Aag2 Ago1 known (novel ab) control ranges
# Ago1_Aag2_avk_ctrlGR <-  GRanges(seqnames = res_Ago1_av_cells_known_ctrl_pats$seqnames.x, strand = res_Ago1_av_cells_known_ctrl_pats$strand.x, ranges = IRanges(start = res_Ago1_av_cells_known_ctrl_pats$start.x, width =res_Ago1_av_cells_known_ctrl_pats$width.x ))
# 
# ToPlot_Aag2_Ago1_avk_ctrl <- regionPlot(bamFile = "/Users/kathryn/Reprocess_all_paper_datasets/AaegL5_mapped/cat_Aag2Ago1.bw",Ago1_Aag2_avk_ctrlGR ,format = "bigwig")
# f <- plotRegion(ToPlot_Aag2_Ago1_avk_ctrl,outliers = 0.05,groupBy = "Sample")
# Aag2_Ago1_avk_ctrl <- f$data
# Aag2_Ago1_avk_ctrl$Sample <- c("ctrl_peaks")
# 
# Aag2_Ago1_avk <- rbind(Aag2_Ago1_avk_ctrl, Aag2_Ago1_avk)
# 
# #save(Aag2_Ago1_avk, aegypti_Ago1_avk, file="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_novelabmiRNAs.RData")

output file “known_novelabmiRNAs.RData” is available on my Github

Combine coverage with known (novel abundance) miRNA pattern matches by positions and graph; Figures S4C and S4D

load("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/known_novelabmiRNAs.RData")
Ago1_av_known <- grep("res_Ago1_av_a", names(.GlobalEnv),value=TRUE)
Ago1_av_known <- c(Ago1_av_known, grep("res_Ago1_av_c", names(.GlobalEnv),value=TRUE))
Ago1_av_known <- grep("pats", Ago1_av_known,value=TRUE, invert = TRUE)

Ago1_av_known  <- do.call("list",mget(Ago1_av_known ))
Ago1_av_known  <- lapply(Ago1_av_known , as.data.frame)
Ago1_av_known  <- lapply(Ago1_av_known , setNames, nm="position")
Ago1_av_known_counts <- lapply(Ago1_av_known , function(x) x %>% group_by(position) %>% summarize(count=n()))
Ago1_av_known_counts <- lapply(Ago1_av_known_counts, function(x) subset(x, x$position <= 300 & x$position >=100))

Ago1avdf <- rbindlist(Ago1_av_known_counts, idcol = TRUE)
Ago1avdf$position <- Ago1avdf$position - 200
Aag2Ago1av <- subset(Ago1avdf, Ago1avdf$.id=="res_Ago1_av_cells_known_ctrl" | Ago1avdf$.id=="res_Ago1_av_cells_known")
Aag2Ago1av$merge <- paste0(Aag2Ago1av$position,Aag2Ago1av$.id)

aegyptiAgo1av <- subset(Ago1avdf, Ago1avdf$.id=="res_Ago1_av_aegypti_known_ctrl" | Ago1avdf$.id=="res_Ago1_av_aegypti_known")
aegyptiAgo1av$merge <- paste0(aegyptiAgo1av$position, aegyptiAgo1av$.id)

av <- c("Aag2_Ago1_avk", "aegypti_Ago1_avk")
norm_av <- do.call("list",mget(av))
avcov <- lapply(norm_av, function(x) subset(x, x$xIndex >= 1400 & x$xIndex <=1600))
avcovdf <- rbindlist(avcov, idcol = TRUE)
avcovdf$xIndex <- avcovdf$xIndex - 1500
col.names <- colnames(avcovdf)
col.names <- gsub(".id", "ranges", col.names)
col.names <- gsub("xIndex", "position", col.names)
avcovdf <- setNames(avcovdf, col.names)
avcells_ago1 <- subset(avcovdf, avcovdf$ranges=="Aag2_Ago1_avk")
avaegypti_ago1 <- subset(avcovdf, avcovdf$ranges=="aegypti_Ago1_avk")

avaegypti_ago1$merge <- ifelse(avaegypti_ago1$Sample=="cat_aegyptiAgo1.bw" , paste0(avaegypti_ago1$position, "res_Ago1_av_aegypti_known"), paste0(avaegypti_ago1$position, "res_Ago1_av_aegypti_known_ctrl"))

avcells_ago1$merge <- ifelse(avcells_ago1$Sample=="cat_Aag2Ago1.bw" , paste0(avcells_ago1$position, "res_Ago1_av_cells_known"), paste0(avcells_ago1$position, "res_Ago1_av_cells_known_ctrl"))

aegyptiAgo1av <- merge(aegyptiAgo1av, avaegypti_ago1, by="merge")
Aag2Ago1av <- merge(Aag2Ago1av, avcells_ago1, by="merge")

aegypti_ago1av <- aegyptiAgo1av  %>%
  group_by(Sample) %>%
  mutate( counts_norm = (count*Score))

aegypti_ago1av  <- aegypti_ago1av %>%
  group_by(Sample) %>%
  mutate(freq_norm = counts_norm/(sum(counts_norm)))

cells_ago1av <- Aag2Ago1av%>%
  group_by(Sample) %>%
  mutate( counts_norm = (count*Score))

cells_ago1av <-cells_ago1av %>%
  group_by(Sample) %>%
  mutate(freq_norm = counts_norm/(sum(counts_norm)))

aegypti_ago1av  <-split(aegypti_ago1av, aegypti_ago1av$.id)
aegypti_ago1av  <- lapply(aegypti_ago1av, function(x) x[order(x$position.x),])
aegypti_ago1av_spline <-lapply(aegypti_ago1av, function(x) spline(x$freq_norm))
aegypti_ago1av_spline <- rbindlist(aegypti_ago1av_spline, idcol = TRUE)
aegypti_ago1av_spline$x <- aegypti_ago1av_spline$x - 101
col.names <- colnames(aegypti_ago1av_spline)
col.names <- gsub(".id", "sample", col.names)
aegypti_ago1av_spline <- setNames(aegypti_ago1av_spline, col.names)

ggplot(aegypti_ago1av_spline , aes(x = x, y = y)) + 
  geom_line(aes(color = sample)) + 
  scale_color_manual(values = c("#3360A9", "#999999")) + theme_bw()

cells_ago1av <-split(cells_ago1av, cells_ago1av$.id)
cells_ago1av <- lapply(cells_ago1av, function(x) x[order(x$position.x),])
cells_ago1av_spline <-lapply(cells_ago1av, function(x) spline(x$freq_norm))
cells_ago1av_spline <- rbindlist(cells_ago1av_spline, idcol = TRUE)
cells_ago1av_spline$x <- cells_ago1av_spline $x - 101
col.names <- colnames(cells_ago1av_spline)
col.names <- gsub(".id", "sample", col.names)
cells_ago1av_spline <- setNames(cells_ago1av_spline, col.names)

ggplot(cells_ago1av_spline , aes(x = x, y = y)) + 
  geom_line(aes(color = sample)) + 
  scale_color_manual(values = c("#1B0B80", "#999999")) + theme_bw()