Additionally, see CLIPflexR documentation for:
fetchSequencesForCLIP
annotatePeaksWithPatterns
fetchSequencesForClIP <- function(peaks,reSize=NULL,fasta,add5=0,add3=0){
swd <- Rsamtools::FaFile(fasta)
fastaLens <- swd %>% seqlengths
start(peaks) <- start(peaks)-add5
end(peaks) <- end(peaks)+add3
sees <- seqlevels(peaks)
seqlengths(peaks) <- fastaLens[match(sees,names(fastaLens),incomparables = 0)]
Boundaries <- GRanges(seqlevels(peaks),IRanges(1,seqlengths(peaks)))
if(!is.null(reSize)){
resizePeaks <- resize(peaks, reSize, fix="center", use.names=TRUE)
}else{
resizePeaks <- peaks
}
names(resizePeaks) <- paste0(seqnames(resizePeaks),":",start(resizePeaks),"-",end(resizePeaks))
#temp <- subsetByOverlaps(resizePeaks,Boundaries,type=c("within"))
resizePeaks <- trim(resizePeaks)##added this line cause off error when extending going ut of bounds
temp <- findOverlaps(resizePeaks,Boundaries,type=c("within"))
temp2 <- Rsamtools::getSeq(Rsamtools:::FaFile(fasta),resizePeaks[temp@from])
names(temp2) <- names(resizePeaks[temp@from])
temp2
}
annotatePeaksWithPatterns <- function(peaks,fasta,patterns,resize=64,add5=0,add3=0,verbose=FALSE,checkReverse=TRUE){
require(GenomicRanges)
require(magrittr)
require(Biostrings)
require(Rsamtools)
if(verbose) message("Reading in peaks....",appendLF = FALSE)
if(is(peaks,"GRanges")){
peaks <- peaks
peaks$originalPeak <- paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), "_", strand(peaks))
}else if(is(peaks,"character")){
if(!file.exists(peaks))stop(paste0("The file ",peaks," does not exist"))
peaks <- read.table(peaks, header = TRUE)
peaks <- makeGRangesFromDataFrame(peaks,
keep.extra.columns=TRUE,
ignore.strand=FALSE,
seqinfo=NULL,
seqnames.field="seqnames",
start.field="start",
end.field="end",
strand.field="strand",
starts.in.df.are.0based=FALSE)
peaks$originalPeak <- paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), ":", strand(peaks))
}
if(verbose) message("...done")
if(verbose) message("Read in ",length(peaks)," peaks")
if(!file.exists(fasta))stop(paste0("The file ",fasta," does not exist"))
if(verbose) message("Indexing FASTA...",appendLF = FALSE)
indexFa(fasta)
if(verbose) message("...done")
if(verbose) message("Aligning seqlevels across peaks and FASTA...",appendLF = FALSE)
fastaLens <- FaFile(fasta) %>% seqlengths
sees <- seqlevels(peaks)
seqlengths(peaks) <- fastaLens[match(sees,names(fastaLens),incomparables = 0)]
if(verbose) message("...done")
if(verbose) message("Removing invalid peaks which are outside contig boundaries...",appendLF = FALSE)
Boundaries <- GRanges(seqlevels(peaks),IRanges(1,seqlengths(peaks)))
resizePeaks <- resize(peaks, resize, fix="center", use.names=TRUE)
resizePeaks <- trim(resizePeaks)
names(resizePeaks) <- paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), "_", strand(peaks))
temp <- findOverlaps(resizePeaks,Boundaries,type=c("within"))
validPeaks <- resizePeaks[temp@from]
validPeaks$extendedPeak <- paste0(seqnames(validPeaks),":",start(validPeaks),"_",end(validPeaks), "_", strand(validPeaks))
if(verbose) message("...done")
if(verbose) message("Removed in ",length(validPeaks)-length(resizePeaks)," peaks")
if(verbose) message("Retrieving sequence from under peaks....",appendLF = FALSE)
myRes <- getSeq(Rsamtools:::FaFile(fasta),validPeaks)
names(myRes) <- validPeaks$extendedPeak
validPeaks$seqUnderExtendedPeak <- myRes
if(verbose) message("...done")
if(verbose) message("Retrieving patterns to search for....",appendLF = FALSE)
if(length(patterns) > 1){
pattern=patterns
}else{
motifSeq <- readDNAStringSet(patterns)
pattern <- as.character(motifSeq)
}
if(verbose) message("...done")
if(verbose) message("Read in ",length(pattern)," patterns")
if(verbose) message("Searching for ",length(pattern)," patterns in peaks....")
mergePeaksAndSites <- patternCallList <- list()
for(i in 1:length(pattern)){
if(verbose) message("Search for ",pattern[i])
peaks_Sites <- validPeaks
fixedInRegion <- vmatchPattern(pattern=pattern[i],subject=myRes) %>% unlist()
startOfPmatch <- resize(fixedInRegion,width=1,fix="start") %>% unname %>% as.character
pos <- matrix(unlist(strsplit(gsub(".*:","",names(fixedInRegion)),"_")),ncol=3,byrow =T)
seq <- gsub(":.*","",names(fixedInRegion))
sitesGRpos <- GRanges(seq[pos[,3] =="+"],
IRanges(as.numeric(pos[pos[,3] =="+",1])+start(fixedInRegion[pos[,3] =="+"])-1,
as.numeric(pos[pos[,3] =="+",1])+end(fixedInRegion[pos[,3] =="+"])-1),startOfPmatch=startOfPmatch[pos[,3] =="+"])
sitesGRpos$PeakID <- names(fixedInRegion[pos[,3] =="+"])
sitesGRneg <- GRanges(seq[pos[,3] =="-"],
IRanges(as.numeric(pos[pos[,3] =="-",2])-end(fixedInRegion[pos[,3] =="-"])+1,
as.numeric(pos[pos[,3] =="-",2])-start(fixedInRegion[pos[,3] =="-"])+1),startOfPmatch=startOfPmatch[pos[,3] =="-"])
sitesGRneg$PeakID <- names(fixedInRegion[pos[,3] =="-"])
sitesFA <- c(fetchSequencesForClIP(sitesGRpos,reSize=NULL,fasta),reverseComplement(fetchSequencesForClIP(sitesGRneg,reSize=NULL,fasta)))
sitesFAExtended <- c(fetchSequencesForClIP(sitesGRpos,reSize=NULL,fasta,add5=add5,add3=add3),
reverseComplement(fetchSequencesForClIP(sitesGRneg,reSize=NULL,fasta,add5=add5,add3=add3))
)
sitesGR <- suppressWarnings(c(sitesGRpos,sitesGRneg))
sitesGR$Seq <- sitesFA
sitesGR$SeqExtended <- sitesFAExtended
sitesGR$siteOfPattern <- granges(sitesGR) %>% unname %>% as.character
sitesGR$centerOfPattern <- resize(sitesGR,width=1,fix="center") %>% unname %>% as.character
dfSitesGR <- as.data.frame(sitesGR)
peaks_SitesDF <- as.data.frame(peaks_Sites)
mergePeaksAndSites[[i]] <- merge(peaks_SitesDF,dfSitesGR,by.x="extendedPeak",by.y="PeakID",all=FALSE)
patternCallList[[i]] <- mergePeaksAndSites[[i]] %>% dplyr::select(Seq,SeqExtended,startOfPmatch,centerOfPattern,siteOfPattern,originalPeak,extendedPeak)
rm(peaks_SitesDF)
rm(peaks_Sites)
}
names(mergePeaksAndSites) <- names(patternCallList) <- as.character(pattern)
return(list(mergePeaksAndSites,patternCallList))
}
#add fun to add regression line with equation from ggplot https://gist.github.com/kdauria/524eade46135f6348140
stat_smooth_func <- function(mapping = NULL, data = NULL,
geom = "smooth", position = "identity",
...,
method = "auto",
formula = y ~ x,
se = TRUE,
n = 80,
span = 0.75,
fullrange = FALSE,
level = 0.95,
method.args = list(),
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE,
xpos = NULL,
ypos = NULL) {
layer(
data = data,
mapping = mapping,
stat = StatSmoothFunc,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
method = method,
formula = formula,
se = se,
n = n,
fullrange = fullrange,
level = level,
na.rm = na.rm,
method.args = method.args,
span = span,
xpos = xpos,
ypos = ypos,
...
)
)
}
StatSmoothFunc <- ggproto("StatSmooth", Stat,
setup_params = function(data, params) {
# Figure out what type of smoothing to do: loess for small datasets,
# gam with a cubic regression basis for large data
# This is based on the size of the _largest_ group.
if (identical(params$method, "auto")) {
max_group <- max(table(data$group))
if (max_group < 1000) {
params$method <- "loess"
} else {
params$method <- "gam"
params$formula <- y ~ s(x, bs = "cs")
}
}
if (identical(params$method, "gam")) {
params$method <- mgcv::gam
}
params
},
compute_group = function(data, scales, method = "auto", formula = y~x,
se = TRUE, n = 80, span = 0.75, fullrange = FALSE,
xseq = NULL, level = 0.95, method.args = list(),
na.rm = FALSE, xpos=NULL, ypos=NULL) {
if (length(unique(data$x)) < 2) {
# Not enough data to perform fit
return(data.frame())
}
if (is.null(data$weight)) data$weight <- 1
if (is.null(xseq)) {
if (is.integer(data$x)) {
if (fullrange) {
xseq <- scales$x$dimension()
} else {
xseq <- sort(unique(data$x))
}
} else {
if (fullrange) {
range <- scales$x$dimension()
} else {
range <- range(data$x, na.rm = TRUE)
}
xseq <- seq(range[1], range[2], length.out = n)
}
}
# Special case span because it's the most commonly used model argument
if (identical(method, "loess")) {
method.args$span <- span
}
if (is.character(method)) method <- match.fun(method)
base.args <- list(quote(formula), data = quote(data), weights = quote(weight))
model <- do.call(method, c(base.args, method.args))
m = model
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(coef(m)[1], digits = 3),
b = format(coef(m)[2], digits = 3),
r2 = format(summary(m)$r.squared, digits = 3)))
func_string = as.character(as.expression(eq))
if(is.null(xpos)) xpos = min(data$x)*0.9
if(is.null(ypos)) ypos = max(data$y)*0.9
data.frame(x=xpos, y=ypos, label=func_string)
},
required_aes = c("x", "y")
)
read in all peaks from matrix building and PCA filtering output; input file “Final_matrix_all_peaks_add_filtering.txt” is peak matrix with all sample and filtering information, generated in PCA_annotation_fgsea_targetvenns script, and available in my Github
all <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_matrix_all_peaks_add_filtering.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
#for Fig 4, start by trying normalizing to reads in all peaks
norm <- colSums(all[,219:226])
all$log2aegyptiAgo1_norm_target <- log2(((all$aegyptiAgo1_counts+1)/7297949)*1E6)
all$log2aegyptiAgo2_norm_target <- log2(((all$aegyptiAgo2_counts+1)/1481167)*1E6)
all$log2aegyptirIgG_norm_target <- log2(((all$aegyptirIgG_counts+1)/777251)*1E6)
all$log2aegyptimIgG_norm_target <- log2(((all$aegyptimIgG_counts+1)/221290)*1E6)
all$log2Aag2Ago1_norm_target <- log2(((all$Aag2Ago1_counts+1)/10744772)*1E6)
all$log2Aag2Ago2_norm_target <- log2(((all$Aag2Ago2_counts+1)/2851099)*1E6)
all$log2Aag2rIgG_norm_target <- log2(((all$Aag2rIgG_counts+1)/1122306)*1E6)
all$log2Aag2mIgG_norm_target <- log2(((all$Aag2mIgG_counts+1)/1060819)*1E6)
#make GR for seed search
allGR <- makeGRangesFromDataFrame(all,
keep.extra.columns=TRUE,
ignore.strand=FALSE,
seqinfo=NULL,
seqnames.field="seqnames",
start.field="start",
end.field="end",
strand.field="strand",
starts.in.df.are.0based=FALSE)
“all_unique_filt_seeds_rn3.fa” file with unique small RNA 6mer target sequences was generated in smallRNA_abundance_scatterplots script and is available in my Github “Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa” is available for download at Vectorbase
Additionally, see CLIPflexR documentation for:
annotatePeaksWithPatterns
res_Agon <- annotatePeaksWithPatterns(peaks=allGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa",
patterns= "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/all_unique_filt_seeds_rn3.fa",
resize=70,
add5=11,
add3=1,
verbose=TRUE)
## Loading required package: Rsamtools
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 5 out-of-bound ranges located on sequences
## NIGP01000179, NIGP01000530, NIGP01001658, and NIGP01001859. Note
## that ranges located on a sequence whose length is unknown (NA) or
## on a circular sequence are not considered out-of-bound (use
## seqlengths() and isCircular() to get the lengths and circularity
## flags of the underlying sequences). You can use trim() to trim
## these ranges. See ?`trim,GenomicRanges-method` for more
## information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 232 patterns
## Searching for 232 patterns in peaks....
## Search for AAAAAA
## Search for AAACCG
## Search for AAAGAC
## Search for AATCTC
## Search for AATGCA
## Search for ACAATA
## Search for ACAATC
## Search for ACAGGG
## Search for ACGCTT
## Search for ACGGGT
## Search for ACTGCC
## Search for AGACTG
## Search for AGCAAT
## Search for AGCACA
## Search for AGCTTT
## Search for AGTACT
## Search for AGTATT
## Search for AGTCAA
## Search for AGTGAG
## Search for ATACTC
## Search for CAACGA
## Search for CAAGGG
## Search for CATATC
## Search for CATCAC
## Search for CATGAC
## Search for CATTCC
## Search for CCAAAG
## Search for CCACCG
## Search for CCGTCC
## Search for CCTCGC
## Search for CGCTTT
## Search for CGTACT
## Search for CTAGTC
## Search for CTCCTC
## Search for CTCTCT
## Search for CTTATG
## Search for CTTGCC
## Search for GAACAA
## Search for GAATTG
## Search for GAGATT
## Search for GATATT
## Search for GATCTC
## Search for GATCTG
## Search for GATTTC
## Search for GCACAA
## Search for GCAGCA
## Search for GCAGCT
## Search for GCATGA
## Search for GCATTT
## Search for GCTATG
## Search for GGACCA
## Search for GGTCAT
## Search for GGTGCT
## Search for GGTTGT
## Search for GGTTTC
## Search for GTACAA
## Search for GTACTT
## Search for GTGGTT
## Search for GTGTTC
## Search for GTTAAC
## Search for GTTACC
## Search for GTTCCT
## Search for TAAACC
## Search for TAAGAT
## Search for TACCGC
## Search for TACCTC
## Search for TACCTG
## Search for TAGCGC
## Search for TATACG
## Search for TCAGGA
## Search for TCAGGG
## Search for TCCACT
## Search for TCTTCC
## Search for TCTTTC
## Search for TGACAC
## Search for TGAGAG
## Search for TGCAAT
## Search for TGCCAA
## Search for TGCCAT
## Search for TGCCTT
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 1 out-of-bound range located on sequence
## NIGP01001658. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## Search for TGCTCA
## Search for TGGTAA
## Search for TGTCGA
## Search for TGTGAT
## Search for TTCTAA
## Search for TTTACG
## Search for TTTTCA
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 1 out-of-bound range located on sequence
## NIGP01000530. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## Search for ATCAAA
## Search for AAATCC
## Search for ACCGCC
## Search for ACCGGA
## Search for ATGGTA
## Search for GTAGGA
## Search for GTGGGA
## Search for ATGGAC
## Search for CCAGGA
## Search for GACCTA
## Search for CAATCC
## Search for CCAATC
## Search for CCTCTA
## Search for CGGAAT
## Search for CTGGGA
## Search for TGGTCA
## Search for AATCCA
## Search for ATAAAT
## Search for TCCAAA
## Search for TGCCGT
## Search for ACACCA
## Search for ATCCAA
## Search for CAAATC
## Search for CCTCAC
## Search for CGGCCT
## Search for GGCCTA
## Search for TACGTA
## Search for TGGATT
## Search for ACCCAA
## Search for CTACCT
## Search for ATGTCG
## Search for TATGGG
## Search for TCACTA
## Search for AAATAA
## Search for CAGTGT
## Search for CCAAAA
## Search for CCCAGG
## Search for CTTCCA
## Search for GACCAA
## Search for GCGTTT
## Search for GCTCAC
## Search for GTCCCA
## Search for TCCCAG
## Search for TTTTAT
## Search for AACAGC
## Search for AATGAG
## Search for AGCGTC
## Search for CATCCA
## Search for CCTGGA
## Search for CCTTCT
## Search for CTCAAC
## Search for TAGTGA
## Search for TATTTT
## Search for TGCGTT
## Search for AAGTCA
## Search for ACTTTT
## Search for CAGACA
## Search for CCGGAT
## Search for CTGAAA
## Search for CTGTCA
## Search for GCCTCT
## Search for GGATGT
## Search for GGGTCC
## Search for TGTATC
## Search for TGTGGA
## Search for TGTTTG
## Search for AAAATC
## Search for AAACAC
## Search for ACGCCA
## Search for ACGCTA
## Search for AGCACT
## Search for AGTCCC
## Search for ATGTTT
## Search for CAGAAT
## Search for CAGCCC
## Search for CATAAG
## Search for CATTTA
## Search for CCCTAT
## Search for CCTCCA
## Search for CGACGA
## Search for CGAGAA
## Search for CGCCTC
## Search for CTGTAT
## Search for CTTGGA
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 1 out-of-bound range located on sequence
## NIGP01000720. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## Search for GAGGAG
## Search for GATGGC
## Search for GCACTT
## Search for GCCGCA
## Search for GCTTGG
## Search for GGTGAC
## Search for GTAAAT
## Search for GTATCA
## Search for GTGTTT
## Search for GTTTGG
## Search for GTTTTC
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 1 out-of-bound range located on sequence
## NIGP01000530. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths
## and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
## more information.
## Search for TACGCC
## Search for TACTAG
## Search for TCAAAT
## Search for TCACCA
## Search for TCACCG
## Search for TCCAGA
## Search for TCTCTA
## Search for TCTCTC
## Search for TTCACC
## Search for TTCAGA
## Search for AACCAC
## Search for AATGAT
## Search for ACAGGT
## Search for ACCACA
## Search for AGCCAC
## Search for ATCGCT
## Search for ATGGCT
## Search for CAGAGC
## Search for CCAGAG
## Search for CCAGAT
## Search for CGAAAC
## Search for CGATTG
## Search for CGATTT
## Search for CGCCGC
## Search for CGCTTC
## Search for CTCTAG
## Search for CTCTAT
## Search for CTCTCC
## Search for GAACAG
## Search for GAGACT
## Search for GGTTGA
## Search for TAAGTA
## Search for TACCGT
## Search for TAGTGC
## Search for TCAAGA
## Search for TCCCAT
## Search for TCTAGG
## Search for TGCCTC
## Search for TGTCGT
## Search for TTAGTG
## Search for TTCAGC
## Search for TTCTGT
## Search for TTGATG
## Search for CAGCCT
## Search for GAATTA
## Search for TCAGCT
## Search for TTATCA
## Search for AATTAA
## Search for ACTGGG
## Search for ATGGCA
res_Agon <- res_Agon[[1]]
res_Agon <- rbindlist(res_Agon)
#make sure are only counting pattern matches 1X
#if sequence searched AND pattern position are exactly the same, is a duplicate
res_Agon$search_seq_pattern <- paste0(res_Agon$seqUnderExtendedPeak, "_", res_Agon$siteOfPattern)
length(unique(res_Agon$search_seq_pattern))
## [1] 429281
nrow(res_Agon)
## [1] 429281
res_Agon$search_seq_pattern <- NULL
#but, could still have dups if extended sequence of one peak overlaps with the extended peak of another sequence and then a match is in both
#if site of pattern and extended sequence is the same, show only the peak with the start of pMatch closest to the center
res_Agon$group <- paste0(res_Agon$SeqExtended, "_", res_Agon$siteOfPattern)
length(unique(res_Agon$group)) #384778
## [1] 384778
res_Agon$startOfPmatch <- as.numeric(res_Agon$startOfPmatch)
res_Agon$filt <- duplicated(res_Agon$group)
res_Agon$center_dist <- res_Agon$startOfPmatch - 33
res_Agon$distance_from_zero <- ifelse(res_Agon$center_dist >=0, paste0(res_Agon$center_dist), res_Agon$center_dist * -1 )
res_Agon$TSSdistance_from_zero <- ifelse(res_Agon$distanceToTSS >=0, paste0(res_Agon$distanceToTSS), res_Agon$distanceToTSS * -1 )
res_Agon$distance_from_zero <- as.numeric(res_Agon$distance_from_zero)
res_Agon$TSSdistance_from_zero <- as.numeric(res_Agon$TSSdistance_from_zero)
h <- res_Agon %>% group_by(group) %>% top_n(-1, distance_from_zero) #385574 # still some dups, ties I guess (yep)
length(unique(h$group)) #384778
## [1] 384778
#can further break ties by sorting by the distance to TSS
h$dups <- duplicated(h$group)
q <- h[!is.na(h$TSSdistance_from_zero),] #get only ones with annotation, because only these ones have TSS distance
k <- h[is.na(h$TSSdistance_from_zero),] #7044
h <- q %>% group_by(group) %>% top_n(-1, TSSdistance_from_zero) #377740
q <- rbind(h, k) #384784
q$dups <- duplicated(q$group)
#if I cannot assign, throw dups away randomly
res_Agon_to_join <- subset(q, q$dups==FALSE) ##384778
res_Agon_to_join$originalPeak <- gsub("\\_-" , "\\:-", res_Agon_to_join$originalPeak)
res_Agon_to_join$originalPeak <- gsub("\\_[+]" , "\\:+",res_Agon_to_join$originalPeak)
#all sRNAs
res_Agon_to_join$strand.y <- res_Agon_to_join$strand.x
res_Agon_to_join <- res_Agon_to_join[,c(7, 1, 264:279)]
col.names <- colnames(res_Agon_to_join)
col.names <- gsub("\\.y", "", col.names)
col.names <- gsub("center_dist", "Pdistance_to_peak_center", col.names)
res_Agon_to_join <- setNames(res_Agon_to_join, col.names)
col.names <- colnames(res_Agon_to_join)
col.names <- c(col.names[1:3], paste0("P",col.names[4:8]), col.names[9:18])
res_Agon_to_join <- setNames(res_Agon_to_join, col.names)
#merge with original peak matrix - will increase row # because have potentially more than one 6mer match/peak
res_all <- merge(all, res_Agon_to_join, by="peakID", all.x = TRUE)
“Table_S4.txt” table was generated in smallRNA_abundance_scatterplots script
Also need raw counts to normalize with a pseudocount of 1 for log2 visualization: “novel_miRNAs_filtered_counts_all_info.txt”, “known_miRNAs_allcols.txt”, and “all_putative_known_sRNA_seeds2.txt” files were generated in the mirdeep2_processing_filtering_to_upload script
All input files are available in my Github
seeds_df <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Mol_Cell_resub/Revision_files/Supplementary_Tables/Table_S4.txt", header =TRUE, sep = "\t")
seeds_df_byfam <- seeds_df %>% group_by(.dots=c("six_mer_target", "six_mer")) %>% summarise(miRNA_family=first(aae_smallRNA_family))
res_all <- merge(res_all, seeds_df_byfam, by.x = "Seq", by.y = "six_mer_target", all.x = TRUE)
novel_filtered <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/novel_miRNAs_filtered_counts_all_info.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
known_filtered <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/known_miRNAs_allcols.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
known_filtered <- known_filtered[,c(1:150)]
all_sRNA_filtered <- rbind(known_filtered,novel_filtered)
seed_table <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/all_putative_known_sRNA_seeds2.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
all_sRNA_byfam <- merge(all_sRNA_filtered, seed_table[,c("Row.names", "six_mer")], by.x = "miRNA", by.y = "Row.names")
##have duplicate seeds in all_sRNA_by_fam since have considered known and novel differently and now are bound together, so need to collapse by seed again
all_sRNA_byfam1 <- all_sRNA_byfam %>% group_by(six_mer) %>% summarise( log2aegyptiAgo1_norm_miRNA=log2(sum(aegyptiAgo1_counts_norm)), log2aegyptiAgo2_norm_miRNA=log2(sum(aegyptiAgo2_counts_norm)), log2aegyptirIgG_norm_miRNA=log2(sum(aegyptirIgG_counts_norm)), log2aegyptimIgG_norm_miRNA=log2(sum(aegyptimIgG_counts_norm)), log2Aag2Ago1_norm_miRNA=log2(sum(Aag2Ago1_counts_norm)), log2Aag2Ago2_norm_miRNA=log2(sum(Aag2Ago2_counts_norm)),log2Aag2rIgG_norm_miRNA=log2(sum(Aag2rIgG_counts_norm)), log2Aag2mIgG_norm_miRNA=log2(sum(Aag2mIgG_counts_norm)))
res_all <- merge(res_all, all_sRNA_byfam1, by= "six_mer", all.x=TRUE)
#remove ones without pattern match or annotation for visualization
res_all <- res_all[!is.na(res_all$six_mer),]
res_all <- res_all[!is.na(res_all$distanceToTSS),]
#simplify ann groups
res_all$ann_group <- ifelse(grepl("Exon", res_all$annotation), paste0("Exon"), paste0(res_all$annotation))
res_all$ann_group <- ifelse(grepl("Intron", res_all$ann_group), paste0("Intron"), paste0(res_all$ann_group))
Input files “Final_cells_Ago2_matrix_filtered_peaks.txt” and “Final_aegypti_Ago2_matrix_filtered_peaks.txt” are available in my Github
AaegL5 LVP_AGWG genome gtf is available for download at Vectorbase or available upon request
# need to filter on miRNA cols for AGO2
hi_ago2 <- res_all[res_all$log2Aag2Ago2_norm_miRNA > 9 & res_all$log2Aag2Ago1_norm_miRNA<9 & res_all$log2aegyptiAgo2_norm_miRNA > 9 & res_all$log2aegyptiAgo1_norm_miRNA<9,]
#select filtered peaks
cells_Ago2_peaks_filt <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_cells_Ago2_matrix_filtered_peaks.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
Ago2_aag2hi <- hi_ago2[hi_ago2$peakID %in% cells_Ago2_peaks_filt$peakID,]
ae_Ago2_peaks_filt <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_aegypti_Ago2_matrix_filtered_peaks.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
Ago2_aehi <-hi_ago2[hi_ago2$peakID %in% ae_Ago2_peaks_filt$peakID,]
Aag2hiGR <- makeGRangesFromDataFrame(Ago2_aag2hi ,
keep.extra.columns=FALSE,
ignore.strand=FALSE,
seqinfo=NULL,
seqnames.field="seqnames",
start.field="start",
end.field="end",
strand.field="strand",
starts.in.df.are.0based=FALSE)
gtffile <- "/Users/kathryn/Aedes-aegypti-LVP_AGWG_BASEFEATURES_AaegL5.2.gtf"
TxDb <- makeTxDbFromGFF(gtffile)
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(type, phase): The "phase" metadata column contains non-NA values for features of
## type stop_codon. This information was ignored.
## OK
AGOhianno <- annotatePeak(Aag2hiGR,
TxDb=TxDb,
sameStrand=FALSE,
genomicAnnotationPriority = c("3UTR", "5UTR", "Exon",
"Intron", "Promoter", "Downstream", "Intergenic"),
overlap = "all")
## >> preparing features information... 2021-01-25 16:16:15
## >> identifying nearest features... 2021-01-25 16:16:15
## >> calculating distance from peak to TSS... 2021-01-25 16:16:16
## >> assigning genomic annotation... 2021-01-25 16:16:16
## >> assigning chromosome lengths 2021-01-25 16:16:21
## >> done... 2021-01-25 16:16:21
plotAnnoPie(AGOhianno)
AehiGR <- makeGRangesFromDataFrame(Ago2_aehi ,
keep.extra.columns=FALSE,
ignore.strand=FALSE,
seqinfo=NULL,
seqnames.field="seqnames",
start.field="start",
end.field="end",
strand.field="strand",
starts.in.df.are.0based=FALSE)
AGOaehianno <- annotatePeak(AehiGR,
TxDb=TxDb,
sameStrand=FALSE,
genomicAnnotationPriority = c("3UTR", "5UTR", "Exon",
"Intron", "Promoter", "Downstream", "Intergenic"),
overlap = "all")
## >> preparing features information... 2021-01-25 16:16:21
## >> identifying nearest features... 2021-01-25 16:16:21
## >> calculating distance from peak to TSS... 2021-01-25 16:16:21
## >> assigning genomic annotation... 2021-01-25 16:16:21
## >> assigning chromosome lengths 2021-01-25 16:16:22
## >> done... 2021-01-25 16:16:22
plotAnnoPie(AGOaehianno)
#Add annotation column for “top” (ie. abundant) small RNA and classify by type (known/novel)
“all_Ago1_known_novel_fam_rn.fa” & “all_Ago2_known_novel_fam_rn.fa” files were generated in smallRNA_abundance_scatterplots_to_upload script and are available in my Github
Ago1_sRNAs <- readDNAStringSet("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/all_Ago1_known_novel_fam_rn.fa", format = "fasta")
Ago1_sRNAs_cells <- Ago1_sRNAs[grepl("common-Ago1|Aag2-Ago1|Aag2-Ago1|top-Aag2|top-both", names(Ago1_sRNAs))]
Ago1_sRNAs_aegypti <- Ago1_sRNAs[grepl("common-Ago1|aegypti-Ago1|top-aegypti|top-both", names(Ago1_sRNAs))]
res_all$top_smallRNA <- ifelse(res_all$Seq %in% Ago1_sRNAs_cells & res_all$Seq %in% Ago1_sRNAs_aegypti, paste0("Ago1_both"), paste0("NA"))
res_all$top_smallRNA <- ifelse(res_all$Seq %in% Ago1_sRNAs_cells & !res_all$Seq %in% Ago1_sRNAs_aegypti, paste0("Ago1_cells"), paste0(res_all$top_smallRNA))
res_all$top_smallRNA <- ifelse(!res_all$Seq %in% Ago1_sRNAs_cells & res_all$Seq %in% Ago1_sRNAs_aegypti, paste0("Ago1_aegypti"), paste0(res_all$top_smallRNA))
Ago2_sRNAs <- readDNAStringSet("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/all_Ago2_known_novel_fam_rn.fa", format = "fasta")
Ago2_sRNAs_cells <- Ago2_sRNAs[grepl("common-Ago2|Aag2-Ago2|Aag2-Ago2|aae-miR", names(Ago2_sRNAs))]
Ago2_sRNAs_aegypti <- Ago2_sRNAs[grepl("common-Ago2|aegypti-Ago2|aae-miR", names(Ago2_sRNAs))]
res_all$top_smallRNA2 <- ifelse(res_all$Seq %in% Ago2_sRNAs_cells & res_all$Seq %in% Ago2_sRNAs_aegypti, paste0("Ago2_both"), paste0("NA"))
res_all$top_smallRNA2 <- ifelse(res_all$Seq %in% Ago2_sRNAs_cells & !res_all$Seq %in% Ago2_sRNAs_aegypti, paste0("Ago2_cells"), paste0(res_all$top_smallRNA2))
res_all$top_smallRNA2 <- ifelse(!res_all$Seq %in% Ago2_sRNAs_cells & res_all$Seq %in% Ago2_sRNAs_aegypti, paste0("Ago2_aegypti"), paste0(res_all$top_smallRNA2))
res_all$top_smallRNA <- ifelse(res_all$top_smallRNA!="NA" & res_all$top_smallRNA2!="NA", paste0(res_all$top_smallRNA, "; ", res_all$top_smallRNA2), paste0(res_all$top_smallRNA))
res_all$top_smallRNA <- ifelse(res_all$top_smallRNA=="NA", paste0(res_all$top_smallRNA2), paste0(res_all$top_smallRNA))
res_all$top_smallRNA2 <- NULL
res_Ago1_BCsample <- subset(res_all, res_all$Aag2_Ago1_BCsample > 0 | res_all$aegypti_Ago1_BCsample > 0)
#get 3'UTR
res_Ago1_BCsample_3UTR <- res_Ago1_BCsample[grepl("3' UTR", res_Ago1_BCsample$annotation),]
#need to summarize all miRNA and target expression cols by miRNA family
res_Ago1_BCsample_3UTRgraph <- res_Ago1_BCsample_3UTR %>% group_by(miRNA_family) %>% summarize(avlog2aegyptiAgo1_norm_miRNA=mean(log2aegyptiAgo1_norm_miRNA), avlog2aegyptiAgo2_norm_miRNA=mean(log2aegyptiAgo2_norm_miRNA), avlog2aegyptirIgG_norm_miRNA=mean(log2aegyptirIgG_norm_miRNA), avlog2aegyptimIgG_norm_miRNA=mean(log2aegyptimIgG_norm_miRNA), avlog2Aag2Ago1_norm_miRNA=mean(log2Aag2Ago1_norm_miRNA), avlog2Aag2Ago2_norm_miRNA=mean(log2Aag2Ago2_norm_miRNA),log2Aag2rIgG_norm_miRNA=mean(log2Aag2rIgG_norm_miRNA), avlog2Aag2mIgG_norm_miRNA=mean(log2Aag2mIgG_norm_miRNA),
avlog2aegyptiAgo1_norm_target=mean(log2aegyptiAgo1_norm_target), avlog2aegyptiAgo2_norm_target=mean(log2aegyptiAgo2_norm_target), avlog2aegyptirIgG_norm_target=mean(log2aegyptirIgG_norm_target), avlog2aegyptimIgG_norm_target=mean(log2aegyptimIgG_norm_target), avlog2Aag2Ago1_norm_target=mean(log2Aag2Ago1_norm_target), avlog2Aag2Ago2_norm_target=mean(log2Aag2Ago2_norm_target),log2Aag2rIgG_norm_target=mean(log2Aag2rIgG_norm_target), avlog2Aag2mIgG_norm_target=mean(log2Aag2mIgG_norm_target), avdistance_topeakcenter=mean(Pdistance_to_peak_center), avdistancetoTSS=mean(distanceToTSS), target = first(Seq), six_mer = first(six_mer), top_smallRNA = first(top_smallRNA))
#classify all types I want to highlight
res_Ago1_BCsample_3UTRgraph$type <- ifelse(grepl("novel",res_Ago1_BCsample_3UTRgraph$miRNA_family), paste0("novel"), paste0("known"))
res_Ago1_cells_BCsample_3UTRgraph <- res_Ago1_BCsample_3UTRgraph
res_Ago1_cells_BCsample_3UTRgraph$highlight <- ifelse(grepl(".*(Ago1_cells|Ago1_both).*", res_Ago1_cells_BCsample_3UTRgraph$top_smallRNA), paste0("top"), paste0("NA"))
#get linear regression for all data
linearMod <- lm(avlog2Aag2Ago1_norm_target ~ avlog2Aag2Ago1_norm_miRNA, data=subset(res_Ago1_BCsample_3UTRgraph,res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA))
#0.40329 = Intercept 0.03482 = Slope
summary(linearMod) #Multiple R-squared: 0.1571
##
## Call:
## lm(formula = avlog2Aag2Ago1_norm_target ~ avlog2Aag2Ago1_norm_miRNA,
## data = subset(res_Ago1_BCsample_3UTRgraph, res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA >
## res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03905 -0.18342 0.02947 0.16297 0.97795
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.403284 0.040743 9.898 < 2e-16 ***
## avlog2Aag2Ago1_norm_miRNA 0.034824 0.006132 5.679 5.62e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2917 on 173 degrees of freedom
## Multiple R-squared: 0.1571, Adjusted R-squared: 0.1523
## F-statistic: 32.25 on 1 and 173 DF, p-value: 5.62e-08
#now for highly expressed miRNAs
linearModsub <- lm(avlog2Aag2Ago1_norm_target ~ avlog2Aag2Ago1_norm_miRNA, data=subset(res_Ago1_BCsample_3UTRgraph, res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA & res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > 9))
# intercept -0.93545 slope: 0.15491
summary(linearModsub) #Multiple R-squared:0.3673
##
## Call:
## lm(formula = avlog2Aag2Ago1_norm_target ~ avlog2Aag2Ago1_norm_miRNA,
## data = subset(res_Ago1_BCsample_3UTRgraph, res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA >
## res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA &
## res_Ago1_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA >
## 9))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.04560 -0.23167 -0.08035 0.22470 0.75603
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.93545 0.42288 -2.212 0.034461 *
## avlog2Aag2Ago1_norm_miRNA 0.15491 0.03651 4.242 0.000185 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3745 on 31 degrees of freedom
## Multiple R-squared: 0.3673, Adjusted R-squared: 0.3469
## F-statistic: 18 on 1 and 31 DF, p-value: 0.0001855
cor.test(subset(res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA, res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA> res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA), subset(res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_target, res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA> res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA), alternative = "greater",
method = c("pearson")) #p-value = 2.81e-08, r = 0.396404
##
## Pearson's product-moment correlation
##
## data: subset(res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA, and subset(res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_target, res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > and res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA) and res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA)
## t = 5.6791, df = 173, p-value = 2.81e-08
## alternative hypothesis: true correlation is greater than 0
## 95 percent confidence interval:
## 0.2857722 1.0000000
## sample estimates:
## cor
## 0.396404
cor.test(subset(res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA, res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA> res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA & res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > 9), subset(res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_target, res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA> res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA & res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > 9), alternative = "greater",
method = c("pearson")) #p-value = 9.273e-05, r = 0.6060637
##
## Pearson's product-moment correlation
##
## data: subset(res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA, and subset(res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_target, res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > and res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA & and res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA & res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > and res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > 9) and 9)
## t = 4.2423, df = 31, p-value = 9.273e-05
## alternative hypothesis: true correlation is greater than 0
## 95 percent confidence interval:
## 0.3819736 1.0000000
## sample estimates:
## cor
## 0.6060637
ggplot(aes(x=avlog2Aag2Ago1_norm_miRNA,y=avlog2Aag2Ago1_norm_target), data=subset(res_Ago1_cells_BCsample_3UTRgraph, res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA)) +
geom_smooth(method="lm",se=FALSE, colour= "black") + stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) +
geom_point(aes(shape =type, colour=highlight), size=4) + scale_shape_manual(values=c(19,1)) + scale_colour_manual(values = c( "gray", "#000099")) + geom_abline(intercept = -0.93545, slope = 0.15491 , linetype="dashed") +
theme_bw() + geom_text_repel(data = subset(res_Ago1_cells_BCsample_3UTRgraph, res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA & res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA>9 & res_Ago1_cells_BCsample_3UTRgraph$highlight=="top"), aes(label = miRNA_family), size=2, force=5)
#linear regression for all data
linearMod <- lm(avlog2aegyptiAgo1_norm_target ~ avlog2aegyptiAgo1_norm_miRNA, data=subset(res_Ago1_BCsample_3UTRgraph,res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA))
#-1.06234 = Intercept 0.023151 = Slope
summary(linearMod) #Multiple R-squared: 0.06666
##
## Call:
## lm(formula = avlog2aegyptiAgo1_norm_target ~ avlog2aegyptiAgo1_norm_miRNA,
## data = subset(res_Ago1_BCsample_3UTRgraph, res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA >
## res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67681 -0.21461 -0.03956 0.13292 1.22935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.062340 0.064727 -16.413 < 2e-16 ***
## avlog2aegyptiAgo1_norm_miRNA 0.023151 0.007843 2.952 0.00379 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.327 on 122 degrees of freedom
## Multiple R-squared: 0.06666, Adjusted R-squared: 0.05901
## F-statistic: 8.714 on 1 and 122 DF, p-value: 0.003789
#now for highly expressed miRNAs
linearModsub <- lm(avlog2aegyptiAgo1_norm_target ~ avlog2aegyptiAgo1_norm_miRNA, data=subset(res_Ago1_BCsample_3UTRgraph, res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA & res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > 9))
# intercept -2.07818 slope: 0.11344
summary(linearModsub) #Multiple R-squared: 0.2979,
##
## Call:
## lm(formula = avlog2aegyptiAgo1_norm_target ~ avlog2aegyptiAgo1_norm_miRNA,
## data = subset(res_Ago1_BCsample_3UTRgraph, res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA >
## res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA &
## res_Ago1_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA >
## 9))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48723 -0.17622 -0.08715 0.15472 0.91655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.07818 0.29649 -7.009 9.88e-09 ***
## avlog2aegyptiAgo1_norm_miRNA 0.11344 0.02596 4.369 7.25e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3181 on 45 degrees of freedom
## Multiple R-squared: 0.2979, Adjusted R-squared: 0.2823
## F-statistic: 19.09 on 1 and 45 DF, p-value: 7.251e-05
res_Ago1_aegypti_BCsample_3UTRgraph <- res_Ago1_BCsample_3UTRgraph
res_Ago1_aegypti_BCsample_3UTRgraph$highlight <- ifelse(grepl(".*(Ago1_aegypti|Ago1_both).*", res_Ago1_aegypti_BCsample_3UTRgraph$top_smallRNA), paste0("top"), paste0("NA"))
cor.test(subset(res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA, res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA> res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA ), subset(res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_target, res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA> res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA ), alternative = "greater",
method = c("pearson"))
##
## Pearson's product-moment correlation
##
## data: subset(res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA, and subset(res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_target, res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > and res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA) and res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA)
## t = 2.9519, df = 122, p-value = 0.001895
## alternative hypothesis: true correlation is greater than 0
## 95 percent confidence interval:
## 0.1141349 1.0000000
## sample estimates:
## cor
## 0.2581885
cor.test(subset(res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA, res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA> res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA & res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > 9), subset(res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_target, res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA> res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA & res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > 9), alternative = "greater",
method = c("pearson"))
##
## Pearson's product-moment correlation
##
## data: subset(res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA, and subset(res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_target, res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > and res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA & and res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA & res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > and res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > 9) and 9)
## t = 4.3695, df = 45, p-value = 3.625e-05
## alternative hypothesis: true correlation is greater than 0
## 95 percent confidence interval:
## 0.3490822 1.0000000
## sample estimates:
## cor
## 0.5457927
ggplot(aes(x=avlog2aegyptiAgo1_norm_miRNA,y=avlog2aegyptiAgo1_norm_target), data=subset(res_Ago1_aegypti_BCsample_3UTRgraph, res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA)) +
geom_smooth(method="lm",se=FALSE, colour= "black") + stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) +
geom_point(aes(shape =type, colour=highlight), size=4) + scale_shape_manual(values=c(19,1)) + scale_colour_manual(values = c( "gray", "#3366FF")) + geom_abline(intercept = -2.07818, slope = 0.11344 , linetype="dashed") +
theme_bw() + geom_text_repel(data = subset(res_Ago1_aegypti_BCsample_3UTRgraph, res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA & res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA>9 & res_Ago1_aegypti_BCsample_3UTRgraph$highlight=="top"), aes(label = miRNA_family), size=2, force=10)
#pies
all_sRNA_byfam2 <- seeds_df %>% group_by(six_mer_target) %>% summarise(aae_smallRNA_family= first(aae_smallRNA_family), aegyptiAgo1_norm_miRNA=sum(aegyptiAGO1_counts_norm), aegyptiAgo2_norm_miRNA=sum(aegyptiAGO2_counts_norm), Aag2Ago1_norm_miRNA=sum(Aag2AGO1_counts_norm), Aag2Ago2_norm_miRNA=sum(Aag2AGO2_counts_norm))
select_mosq <- subset(res_Ago1_aegypti_BCsample_3UTRgraph, res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo1_norm_miRNA > res_Ago1_aegypti_BCsample_3UTRgraph$avlog2aegyptiAgo2_norm_miRNA)
aegypti_pie <- merge(all_sRNA_byfam2, select_mosq[,c("target", "top_smallRNA", "type", "highlight")], by.x ="six_mer_target", by.y = "target")
aegypti_pie2 <- aegypti_pie %>%
mutate_if(is.numeric, funs(./sum(.)))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.))
##
## # After:
## list(name = ~ f(.))
## This warning is displayed once per session.
aegypti_pie2$type <- ifelse(grepl("top", aegypti_pie2$highlight), paste0(aegypti_pie2$type , "_top"), paste0(aegypti_pie2$type))
aegypti_pie2 <- aegypti_pie2 %>% group_by(type) %>% summarise(percent=sum(aegyptiAgo1_norm_miRNA))
ggplot(aegypti_pie2, aes(x="", y=percent, fill=type, alpha=type)) +
geom_bar(stat="identity") +
coord_polar("y", start=0) + theme_void() +
scale_fill_manual(values=c("gray", "#3360A9", "gray", "#3360A9")) + scale_alpha_manual(values=c(1, 1, 0.5, 0.5))
select_cells <- res_Ago1_cells_BCsample_3UTRgraph %>% filter(res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago1_norm_miRNA > res_Ago1_cells_BCsample_3UTRgraph$avlog2Aag2Ago2_norm_miRNA)
Aag2_pie <- merge(all_sRNA_byfam2, select_cells[,c("target", "top_smallRNA", "type", "highlight")], by.x ="six_mer_target", by.y = "target")
Aag2_pie <- Aag2_pie %>%
mutate_if(is.numeric, funs(./sum(.)))
Aag2_pie$type <- ifelse(grepl("top", Aag2_pie$highlight), paste0(Aag2_pie$type , "_top"), paste0(Aag2_pie$type))
Aag2_pie <- Aag2_pie %>% group_by(type) %>% summarise(percent=sum(Aag2Ago1_norm_miRNA))
ggplot(Aag2_pie, aes(x="", y=percent, fill=type, alpha=type)) +
geom_bar(stat="identity") +
coord_polar("y", start=0) + theme_void() +
scale_fill_manual(values=c("gray", "#1B0B80", "gray", "#1B0B80")) + scale_alpha_manual(values=c(1, 1, 0.5, 0.5))
# #clean up table to remove excess columns; then add different target types (7A1, 7M8, 8, 18mer 0-mismatch, 18mer 1-mismatch)
#
# clean <- res_all
# cleand <- clean[,c(1:8, 203:255,264:265, 272:274, 278, 280, 290)]
#
# #seed_table <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/all_putative_known_sRNA_seeds2.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE) #913 unique seed combos
#
# #link to new names
# seed_table <- merge(seeds_df[,c(1:2, 38)], seed_table,by.x = "sequence" , by.y = "FL", all.x= TRUE )
#
# seed_7A1 <- seed_table %>% group_by(seven_A1_target) %>% summarise(sevenA1_target = first(aae_smallRNA_family))
#
# seed_7M8 <- seed_table %>% group_by(.dots=c("six_mer_target", "seven_M8_target")) %>% summarise(sevenM8_target = paste0(smallRNA, collapse =":"), sevenM8_targetfam = first(aae_smallRNA_family))
#
# seed_8mer <- seed_table %>% group_by(eight_target) %>% summarise(eight_target_fam = paste0(smallRNA, collapse =":"))
#
# seed_8mer_alt <- seed_table %>% group_by(eight_alt_target) %>% summarise(eight_alt_target_fam = paste0(smallRNA, collapse =":"))
#
# cleand <- merge(cleand, seed_7A1, by.x = "miRNA_family", by.y = "sevenA1_target", all.x = TRUE)
# cleand$check <- mapply(grepl, pattern=cleand$seven_A1_target, x=cleand$SeqExtended)
# cleand$seven_A1_target <- ifelse(cleand$check==TRUE, paste0(cleand$miRNA_family), NA)
#
# #7M8
# cleand$check <- ifelse(cleand$strand == "+", mapply(substr, x=cleand$SeqExtended, start=11, stop=17), mapply(substr, x=cleand$SeqExtended, start=1, stop=7))
# cleandd <- merge(cleand, seed_7M8[,c(2:3)], by.x = "check", by.y = "seven_M8_target", all.x = TRUE)
#
# #8mers
# cleandd$check <- ifelse(cleandd$strand == "+", mapply(substr, x=cleandd$SeqExtended, start=11, stop=18), mapply(substr, x=cleandd$SeqExtended, start=1, stop=8))
#
# #canonical miRNA 8mer
# cleandd <- merge(cleandd, seed_8mer, by.x = "check", by.y = "eight_target", all.x = TRUE)
#
# #alternative 8mer, with full complementarity instead of "A" in 3' target position
# cleandd <- merge(cleandd, seed_8mer_alt , by.x = "check", by.y = "eight_alt_target", all.x = TRUE)
#
# #perfect
# seed_table$Ago2_top <- seed_table$six_mer_target %in% Ago2_sRNAs
# Ago2_tom <- seed_table[seed_table$Ago2_top==TRUE,]
# Ago2_tom <- Ago2_tom%>% group_by(.dots=c("six_mer_target", "six_mer", "aae_smallRNA_family")) %>% summarise(smallRNAs = paste0(smallRNA, collapse = ":"), FL_target_seqs = paste0(FL_target, collapse = ":"))
#
# cleanddd <- merge(cleandd, Ago2_tom[,c(1,3,5)], by.x= "Seq", by.y = "six_mer_target",all.x = TRUE )
# cleanddd$check <- mapply(grepl, pattern=cleanddd$SeqExtended, x=cleanddd$FL_target_seqs)
# cleanddd$perfect_18mer_target <- ifelse(cleanddd$check==TRUE, paste0(cleanddd$aae_smallRNA_family), NA)
#
# #one mismatch
# cleanddd$check <- mapply(agrepl, pattern=cleanddd$SeqExtended, x=cleanddd$FL_target_seqs, max=1)
# cleanddd$one_mis_18mer_target <- ifelse(cleanddd$check==TRUE, paste0(cleanddd$aae_smallRNA_family), NA)
#
# to_coll <- cleanddd[,c(1, 3:5, 64:78)]
# #write.table(to_coll,"/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/all_pred_all_info_082120_rn.txt", col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
#
# #collapse predicted pattern matches; first within family - did this on the hpc, too slow otherwise
#
# #test_coll <- to_coll %>% group_by(.dots=c("peakID", "six_mer")) %>% summarise(six_target = dplyr::first(Seq), smallRNA_family = dplyr::first(miRNA_family), extendedPeak = dplyr::first(extendedPeak), seqUnderExtendedPeak = dplyr::first(seqUnderExtendedPeak), SeqExtended = paste0(SeqExtended, collapse = ": "), siteOfPattern = paste0(siteOfPattern, collapse = ": "), centerOfPattern = paste0(centerOfPattern, collapse = ": "), Pdistance_to_peak_center = paste0(distance_from_zero, collapse = ": "), top_smallRNA = dplyr::first(top_smallRNA), Psites_within_family = n(), seven_A1_target = paste0(seven_A1_target, collapse = ": "), seven_M8_target = paste0(sevenM8_target, collapse = ": "), eight_target = paste0(eight_target_fam, collapse = ": "), eight_target_alt = paste0(eight_alt_target_fam, collapse = ": "), perfect_18mer_target = paste0(perfect_18mer_target, collapse= ": "), one_mis_18mer_target = paste0(one_mis_18mer_target, collapse= ": "))
#
#
# #check and reformat collapse within family to simplify
# test_coll <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/all_pred_all_info_082120_rn_col1.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
#
# t <- str_split(test_coll$seven_A1_target, ": ")
#
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# }
#
# test_coll$seven_A1_target <- unlist(lapply(b,function(x) paste0(x, collapse=": ")))
#
# t <- str_split(test_coll$seven_M8_target, ": ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# }
#
# test_coll$seven_M8_target <- unlist(lapply(b,function(x) paste0(x, collapse=": ")))
#
# t <- str_split(test_coll$eight_target, ": ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# }
#
# test_coll$eight_target <- unlist(lapply(b,function(x) paste0(x, collapse=": ")))
#
# t <- str_split(test_coll$eight_target_alt, ": ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# }
#
# test_coll$eight_target_alt <- unlist(lapply(b,function(x) paste0(x, collapse=": ")))
#
# t <- str_split(test_coll$perfect_18mer_target, ": ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# }
#
# test_coll$perfect_18mer_target <- unlist(lapply(b,function(x) paste0(x, collapse=": ")))
#
# t <- str_split(test_coll$one_mis_18mer_target, ": ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# }
#
# test_coll$one_mis_18mer_target <- unlist(lapply(b,function(x) paste0(x, collapse=": ")))
#
# test_coll[,c(13:18)]<- lapply(test_coll[,c(13:18)],function(x) gsub(": NA", "", x))
# test_coll[,c(13:18)]<- lapply(test_coll[,c(13:18)],function(x) gsub("NA: ", "", x))
#
#
# #write.table(test_coll, "/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/all_pred_all_info_infam_082120_reform.txt", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
# ##still rigth
#
# #then across family, again on hpc
#
# #pat_coll <- test_coll %>% group_by(peakID) %>% summarise(six_mer = paste0(six_mer, collapse = "; "), Psix_target = paste0(six_target, collapse = "; "), PsmallRNA_family = paste0(smallRNA_family, collapse = "; "), extendedPeak = dplyr::first(extendedPeak), seqUnderExtendedPeak = dplyr::first(seqUnderExtendedPeak), PSeqExtended = paste0(SeqExtended, collapse = "; "), PsiteOfPattern = paste0(siteOfPattern, collapse = "; "), PcenterOfPattern = paste0(centerOfPattern, collapse = "; "), Pdistance_to_peak_center = paste0(Pdistance_to_peak_center, collapse = "; "), Ptop_smallRNA = paste0(top_smallRNA,collapse = "; "), Psites_within_family = paste0(Psites_within_family, collapse = "; ") ,Psites_unique_families= n(), Pseven_A1_target = paste0(seven_A1_target, collapse = "; "), Pseven_M8_target = paste0(seven_M8_target, collapse = "; "), Peight_target = paste0(eight_target, collapse = "; "), Peight_target_alt = paste0(eight_target_alt, collapse = "; "), Pperfect_18mer_target = paste0(perfect_18mer_target, collapse= "; "), Pone_mis_18mer_target = paste0(one_mis_18mer_target, collapse = "; "), aegypti_Ago1_BCsample = dplyr::first(aegypti_Ago1_BCsample), aegypti_Ago2_BCsample=dplyr::first(aegypti_Ago2_BCsample),Aag2_Ago1_BCsample=dplyr::first(Aag2_Ago1_BCsample), Aag2_Ago2_BCsample=dplyr::first(Aag2_Ago2_BCsample), ann_group = dplyr::first(ann_group))
#
# pat_coll <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/all_pred_all_info_infam_082120_col2.txt", stringsAsFactors = FALSE, header = TRUE, sep = "\t")
#
#
# t <- str_split(pat_coll$Pseven_A1_target, "; ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# b[[i]] <- b[[i]][b[[i]]!="NA"]
#
# }
#
# pat_coll$Pseven_A1_targetc <- unlist(lapply(b,function(x) paste0(x, collapse="; ")))
# pat_coll$Pseven_A1_target <-pat_coll$Pseven_A1_targetc
# pat_coll$Pseven_A1_targetc<- NULL
#
#
# t <- str_split(pat_coll$Pseven_M8_target, "; ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# b[[i]] <- b[[i]][b[[i]]!="NA"]
# }
#
# pat_coll$Pseven_M8_targetc <- unlist(lapply(b,function(x) paste0(x, collapse="; ")))
# pat_coll$Pseven_M8_target <- pat_coll$Pseven_M8_targetc
# pat_coll$Pseven_M8_targetc <- NULL
#
#
# t <- str_split(pat_coll$Peight_target, "; ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# b[[i]] <- b[[i]][b[[i]]!="NA"]
# }
#
# pat_coll$Peight_targetc <- unlist(lapply(b,function(x) paste0(x, collapse="; ")))
# pat_coll$Peight_target <- pat_coll$Peight_targetc
# pat_coll$Peight_targetc <- NULL
#
# t <- str_split(pat_coll$Peight_target_alt, "; ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# b[[i]] <- b[[i]][b[[i]]!="NA"]
# }
# pat_coll$Peight_target_altc <- unlist(lapply(b,function(x) paste0(x, collapse="; ")))
# pat_coll$Peight_target_alt <- pat_coll$Peight_target_altc
# pat_coll$Peight_target_altc <- NULL
#
#
# t <- str_split(pat_coll$Pperfect_18mer_target, "; ")
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# b[[i]] <- b[[i]][b[[i]]!="NA"]
# }
#
# pat_coll$Pperfect_18mer_targetc <- unlist(lapply(b,function(x) paste0(x, collapse="; ")))
# pat_coll$Pperfect_18mer_target <- pat_coll$Pperfect_18mer_targetc
# pat_coll$Pperfect_18mer_targetc <- NULL
#
# t <- str_split(pat_coll$Pone_mis_18mer_target, "; " )
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# b[[i]] <- b[[i]][b[[i]]!="NA"]
# }
#
# pat_coll$Pone_mis_18mer_target <- unlist(lapply(b,function(x) paste0(x, collapse="; ")))
#
# t <- str_split(pat_coll$Ptop_smallRNA, "; ")
#
# b <- vector("list", length(t))
# for(i in 1:length(t)){
# b[[i]] <- unique(t[[i]])
# b[[i]] <- b[[i]][b[[i]]!="NA"]
# }
# pat_coll$Ptop_smallRNAc <- unlist(lapply(b,function(x) paste0(x, collapse="; ")))
# pat_coll$Ptop_smallRNA <- pat_coll$Ptop_smallRNAc
# pat_coll$Ptop_smallRNAc <- NULL
#
#
#
# #then need to merge back with peaks without patterns
# colnames(all)
# all_lim <- all[,c(1:6, 201:253)]
# pat_coll <- pat_coll[,c(1:19)]
# all_with_pats <- merge(all_lim, pat_coll, by= "peakID", all.x = TRUE)
#
# #write.table(all_with_pats, "/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/all_pred_collapsed_082120_reform.txt", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
“chimera_beds.zip” contains the original AaegL5-remapped chimera bedfiles and is available in my Github
# #start with all ranges;extend 70
# allGR70 <- resize(allGR,width=70,fix="center")
# allGR70@elementMetadata <- allGR70@elementMetadata[,c(1, 196:204)]
#
# #read in chimeras
# Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped"
# allmychimera <- dir(Dir,full.names = TRUE,pattern="*_chimera_remap.bed")
# chimbed <- lapply(allmychimera, import.bed)
# names(chimbed) <- gsub("/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped/", "", allmychimera)
# names(chimbed) <- gsub("_remap.bed", "", names(chimbed))
# #subset chimeras by overlaps
#
# chimbed_merge <- lapply(chimbed, function(x) mergeByOverlaps(x, allGR70, type="within"))
#
# test <- lapply(chimbed_merge, function(x) as.data.frame(cbind(as.character(x$name), as.character(x$peakID ))))
#
# col.names <- c("chim_miRNA", "peakID")
# chim_ls <- lapply(test, setNames, col.names)
# #chim_ls <- lapply(chim_ls, setNames, col.names)
# t <- lapply(chim_ls, function(x) smallRNA <- str_split_fixed(x$chim_miRNA, ";", 2)[,2])
# t <- lapply(t, as.data.frame)
# t <- lapply(t, setNames, c("smallRNA"))
#
# x <- mapply(cbind, chim_ls, t, SIMPLIFY=FALSE)
#
# #merge small RNA name from chim with final name annotation
# listm <- seed_table[,c(1:2)]
# listm$reform <- ifelse(grepl("novel", listm$smallRNA), paste0(listm$sequence), paste0(listm$smallRNA))
#
# for (i in 1:length(x)) {
# x[[i]] <- merge(x[[i]], listm, by.x = "smallRNA",by.y = "reform", all.x = TRUE)
# }
#
#
# test <- lapply(x, function(x) x %>% group_by(.dots=c("peakID", "smallRNA.y")) %>% summarize(count=n()))
# test <- lapply(test, as.data.frame)
#
# d <- lapply(test, function(x) x$chim_miRNA <- paste0(x$smallRNA,"x", x$count))
# d<- lapply(d, as.data.frame, stringsAsFactors = FALSE)
# d <- lapply(d, setNames, c("smallRNA_chim"))
#
# chim_test <- mapply(cbind, test, d, SIMPLIFY=FALSE)
#
# #first collapse within family
# j <- lapply(chim_test, function(x) merge(x, seed_table, by.x = "smallRNA.y", by.y = "smallRNA", all.x = TRUE))
# j <- lapply(j, function(x) x %>% group_by(.dots=c("peakID", "six_mer")) %>% summarise(smallRNA_chim = paste0(smallRNA_chim, collapse = ": ")))
# j <- lapply(j, as.data.frame)
#
# #remove NAs
# try <- lapply(j, function(x) x[!is.na(x$six_mer),])
#
# j <- lapply(try, function(x) x %>% group_by(peakID) %>% summarize(smallRNA_chim = paste0(smallRNA_chim, collapse= "; ")))
# j <- lapply(j, as.data.frame)
#
# chim_f <- Reduce(function(x, y) merge(x, y, by = "peakID", all = TRUE), j)
# names(chim_f) <- c("peakID", names(j))
#
# seed_table$Ago1_top_cells <- seed_table$six_mer_target %in% Ago1_sRNAs_cells
# vec1 <- as.character(subset(seed_table$smallRNA, seed_table$Ago1_top_cells==TRUE))
# vec1 <- paste0(vec1, "x")
# seed_table$Ago1_top_aegypti <- seed_table$six_mer_target %in% Ago1_sRNAs_aegypti
#
# vec2 <- as.character(subset(seed_table$smallRNA, seed_table$Ago1_top_aegypti==TRUE))
# vec2 <- paste0(vec2, "x")
#
# for (i in 1:length(vec1)) {
# chim_f$test <- ifelse( grepl(vec1[i], chim_f$Aag2Ago1_chimera ), paste0("TRUE"), paste0(chim_f$test))
# }
# chim_f$test <- ifelse( grepl("TRUE", chim_f$test ), paste0("Ago1_cells"), paste0("NA"))
#
# for (i in 1:length(vec2)) {
# chim_f$test2 <- ifelse( grepl(vec2[i], chim_f$aegyptiAgo1_chimera ), paste0("TRUE"), paste0(chim_f$test2))
# }
# chim_f$test2 <- ifelse( grepl("TRUE", chim_f$test2 ), paste0("Ago1_aegypti"), paste0("NA"))
#
# chim_f$test <- ifelse( chim_f$test!="NA" & chim_f$test2!="NA", paste0("Ago1_both"), paste0(chim_f$test))
# chim_f$test <- ifelse( chim_f$test=="NA" & chim_f$test2!="NA", paste0(chim_f$test2), paste0(chim_f$test))
#
# chim_f$test2 <- NULL
#
# seed_table$Ago2_top_cells <- seed_table$six_mer_target %in% Ago2_sRNAs_cells
# vec3 <- as.character(subset(seed_table$smallRNA, seed_table$Ago2_top_cells==TRUE))
# vec3 <- paste0(vec3, "x")
# seed_table$Ago2_top_aegypti <- seed_table$six_mer_target %in% Ago2_sRNAs_aegypti
# vec4 <- as.character(subset(seed_table$smallRNA, seed_table$Ago2_top_aegypti==TRUE))
# vec4 <- paste0(vec4, "x")
#
# for (i in 1:length(vec3)) {
# chim_f$test2 <- ifelse( grepl(vec3[i], chim_f$Aag2Ago2_chimera ), paste0("TRUE"), paste0(chim_f$test2))
# }
# chim_f$test2 <- ifelse( grepl("TRUE", chim_f$test2 ), paste0("Ago2_cells"), paste0("NA"))
#
# for (i in 1:length(vec4)) {
# chim_f$test3 <- ifelse( grepl(vec4[i], chim_f$aegyptiAgo2_chimera ), paste0("TRUE"), paste0(chim_f$test3))
# }
# chim_f$test3 <- ifelse( grepl("TRUE", chim_f$test3 ), paste0("Ago2_aegypti"), paste0("NA"))
#
# chim_f$test2 <- ifelse( chim_f$test2!="NA" & chim_f$test3!="NA", paste0("Ago2_both"), paste0(chim_f$test2))
# chim_f$test2 <- ifelse( chim_f$test2=="NA" & chim_f$test3!="NA", paste0(chim_f$test3), paste0(chim_f$test2))
#
# chim_f$test3 <- NULL
# chim_f$test <- ifelse( chim_f$test2!="NA" & chim_f$test!="NA", paste0(chim_f$test, "; ", chim_f$test2), paste0(chim_f$test))
# chim_f$test <- ifelse( chim_f$test2!="NA" & chim_f$test=="NA", paste0(chim_f$test2), paste0(chim_f$test))
#
# chim_f$test2 <- NULL
#
#
# ##merge predicted with chimeras
# all_with_pats <- merge(all_with_pats, chim_f, by="peakID", all.x = TRUE)
# all_with_pats <- all_with_pats[,c(1:39,44:86)]
#
# names(all_with_pats)[names(all_with_pats)=="test"] <- "top_smallRNA_chimera"
The 4 input files: “Final_cells_Ago1_matrix_filtered_peaks.txt”, “Final_aegypti_Ago1_matrix_filtered_peaks.txt”, “Final_aegypti_Ago2_matrix_filtered_peaks.txt”, and “Final_aegypti_Ago2_matrix_filtered_peaks.txt” were generated in PCA_annotation_fgsea_targetvenns script and are available in my Github
# cells_Ago1_peaks_filt <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_cells_Ago1_matrix_filtered_peaks.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
#
#
# aegypti_Ago1_peaks_filt <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_aegypti_Ago1_matrix_filtered_peaks.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
#
# cells_Ago2_peaks_filt <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_cells_Ago2_matrix_filtered_peaks.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
#
# aegypti_Ago2_peaks_filt <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_aegypti_Ago2_matrix_filtered_peaks.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
#
# all_with_pats$filtered_peak1 <- ifelse(all_with_pats$peakID %in% cells_Ago1_peaks_filt$peakID & all_with_pats$peakID %in% aegypti_Ago1_peaks_filt$peakID, paste0("Ago1_both"), paste0("NA"))
# all_with_pats$filtered_peak1 <- ifelse(all_with_pats$peakID %in% cells_Ago1_peaks_filt$peakID & !all_with_pats$peakID %in% aegypti_Ago1_peaks_filt$peakID, paste0("Ago1_cells"), paste0(all_with_pats$filtered_peak1))
# all_with_pats$filtered_peak1 <- ifelse(!all_with_pats$peakID %in% cells_Ago1_peaks_filt$peakID & all_with_pats$peakID %in% aegypti_Ago1_peaks_filt$peakID, paste0("Ago1_aegypti"), paste0(all_with_pats$filtered_peak1))
#
#
# all_with_pats$filtered_peak2 <- ifelse(all_with_pats$peakID %in% cells_Ago2_peaks_filt$peakID & all_with_pats$peakID %in% aegypti_Ago2_peaks_filt$peakID, paste0("Ago2_both"), paste0("NA"))
# all_with_pats$filtered_peak2 <- ifelse(all_with_pats$peakID %in% cells_Ago2_peaks_filt$peakID & !all_with_pats$peakID %in% aegypti_Ago2_peaks_filt$peakID, paste0("Ago2_cells"), paste0(all_with_pats$filtered_peak2))
# all_with_pats$filtered_peak2 <- ifelse(!all_with_pats$peakID %in% cells_Ago2_peaks_filt$peakID & all_with_pats$peakID %in% aegypti_Ago2_peaks_filt$peakID, paste0("Ago2_aegypti"), paste0(all_with_pats$filtered_peak2))
# all_with_pats$filtered_peak <- ifelse(all_with_pats$filtered_peak1!="NA" & all_with_pats$filtered_peak2!="NA", paste0(all_with_pats$filtered_peak1, "; ", all_with_pats$filtered_peak2), paste0("NA"))
# all_with_pats$filtered_peak <- ifelse(all_with_pats$filtered_peak1=="NA" & all_with_pats$filtered_peak2!="NA", paste0(all_with_pats$filtered_peak2), paste0(all_with_pats$filtered_peak))
# all_with_pats$filtered_peak <- ifelse(all_with_pats$filtered_peak1!="NA" & all_with_pats$filtered_peak2=="NA", paste0(all_with_pats$filtered_peak1), paste0(all_with_pats$filtered_peak))
# all_with_pats$filtered_peak1 <- NULL
# all_with_pats$filtered_peak2 <- NULL
#
# hiconfago2 <- all_with_pats[ grepl("Ago2_both|Ago2_aegypti", all_with_pats$filtered_peak),]
# hiconfago2 <- hiconfago2[!is.na(hiconfago2$aegyptiAgo2_chimera ) | !is.na(hiconfago2$PsiteOfPattern),] #936
#
# hiconfago1 <- all_with_pats[ grepl("Ago1_both|Ago1_aegypti", all_with_pats$filtered_peak),]
# hiconfago1 <- hiconfago1[!is.na(hiconfago1$aegyptiAgo1_chimera ) | !is.na(hiconfago1$PsiteOfPattern),]
# #1140
#
# hiconfago1cells <- all_with_pats[ grepl("Ago1_both|Ago1_cells", all_with_pats$filtered_peak),]
# hiconfago1cells <- hiconfago1cells[!is.na(hiconfago1cells$Aag2Ago1_chimera) | !is.na(hiconfago1cells$PsiteOfPattern),]
# #1902
#
# hiconfago2cells <- all_with_pats[ grepl("Ago2_both|Ago2_cells", all_with_pats$filtered_peak),]
# hiconfago2cells <- hiconfago2cells[!is.na(hiconfago2cells$Aag2Ago2_chimera) | !is.na(hiconfago2cells$PsiteOfPattern),]
# #1366
# hiconf <- rbind(hiconfago1cells,hiconfago2cells,hiconfago1, hiconfago2)
#
# all_with_pats$hi_confidence_target <- ifelse(all_with_pats$peakID %in% hiconfago1$peakID & all_with_pats$peakID %in% hiconfago1cells$peakID, paste0("Ago1_both"), paste0("NA"))
# all_with_pats$hi_confidence_target <- ifelse(!all_with_pats$peakID %in% hiconfago1$peakID & all_with_pats$peakID %in% hiconfago1cells$peakID, paste0("Ago1_cells"), paste0(all_with_pats$hi_confidence_target))
# all_with_pats$hi_confidence_target <- ifelse(all_with_pats$peakID %in% hiconfago1$peakID & !all_with_pats$peakID %in% hiconfago1cells$peakID, paste0("Ago1_aegypti"), paste0(all_with_pats$hi_confidence_target))
#
# all_with_pats$hi_confidence_target2 <- ifelse(all_with_pats$peakID %in% hiconfago2$peakID & all_with_pats$peakID %in% hiconfago2cells$peakID, paste0("Ago2_both"), paste0("NA"))
# all_with_pats$hi_confidence_target2 <- ifelse(!all_with_pats$peakID %in% hiconfago2$peakID & all_with_pats$peakID %in% hiconfago2cells$peakID, paste0("Ago2_cells"), paste0(all_with_pats$hi_confidence_target2))
# all_with_pats$hi_confidence_target2 <- ifelse(all_with_pats$peakID %in% hiconfago2$peakID & !all_with_pats$peakID %in% hiconfago2cells$peakID, paste0("Ago2_aegypti"), paste0(all_with_pats$hi_confidence_target2))
#
# all_with_pats$hi_confidence_target3 <- ifelse(all_with_pats$hi_confidence_target!="NA" & all_with_pats$hi_confidence_target2!="NA", paste0(all_with_pats$hi_confidence_target, "; ", all_with_pats$hi_confidence_target2), paste0("NA"))
# all_with_pats$hi_confidence_target3 <- ifelse(all_with_pats$hi_confidence_target3=="NA" & all_with_pats$hi_confidence_target!="NA", paste0(all_with_pats$hi_confidence_target), paste0(all_with_pats$hi_confidence_target3))
# all_with_pats$hi_confidence_target3 <- ifelse(all_with_pats$hi_confidence_target2!="NA" & all_with_pats$hi_confidence_target3 =="NA", paste0(all_with_pats$hi_confidence_target2), paste0(all_with_pats$hi_confidence_target3 ))
# all_with_pats$hi_confidence_target <- all_with_pats$hi_confidence_target3
# all_with_pats$hi_confidence_target2 <- NULL
# all_with_pats$hi_confidence_target3 <- NULL
# all_with_pats$filtered_peak1 <- NULL
#
# all_with_pats2 <- all_with_pats %>%
# mutate(Ptop_smallRNA = replace(Ptop_smallRNA, !grepl("_", Ptop_smallRNA), NA))
#
# all_with_pats2 <- all_with_pats2 %>%
# mutate(Pseven_A1_target = replace(Pseven_A1_target, !grepl("-", Pseven_A1_target), NA))
#
# all_with_pats2 <- all_with_pats2 %>%
# mutate(Pseven_M8_target = replace(Pseven_M8_target, !grepl("-", Pseven_M8_target), NA))
#
# all_with_pats2 <- all_with_pats2 %>%
# mutate(Peight_target = replace(Peight_target, !grepl("-", Peight_target), NA))
#
# all_with_pats2 <- all_with_pats2 %>%
# mutate(Peight_target_alt = replace(Peight_target_alt, !grepl("-", Peight_target_alt), NA))
#
# all_with_pats2 <- all_with_pats2 %>%
# mutate(Pperfect_18mer_target = replace(Pperfect_18mer_target, !grepl("-", Pperfect_18mer_target), NA))
#
# all_with_pats2 <- all_with_pats2 %>%
# mutate(Pone_mis_18mer_target = replace(Pone_mis_18mer_target, !grepl("-", Pone_mis_18mer_target), NA))
#
# all_with_pats2 <- all_with_pats2 %>%
# mutate(top_smallRNA_chimera = replace(top_smallRNA_chimera, !grepl("_",top_smallRNA_chimera), NA))
#
# all_with_pats2 <- all_with_pats2 %>%
# mutate(filtered_peak = replace(filtered_peak, !grepl("_",filtered_peak), NA))
#
# all_with_pats2 <- all_with_pats2 %>%
# mutate(hi_confidence_target = replace(hi_confidence_target, !grepl("_",hi_confidence_target), NA))
#
# all_with_pats2$geneChr <- NULL
# all_with_pats2$geneStrand <- NULL
# names(all_with_pats2)[names(all_with_pats2)=="seqnames"] <- "chromosome"
#
# #write.table(all_with_pats2, "/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/Table_S4_Resource_paper.txt", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
# S4_all <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/Table_S4_Resource_paper.txt", header = TRUE, sep = "\t")
# S4 <- S4_all[!is.na(S4_all$hi_confidence_target),]
#
# S3 <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/Table_S3_Resource_paper.txt", header = TRUE, sep = "\t")
#
# #before small RNA family collapse
# uncoll <- read.delim( "/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/all_pred_all_info_082120_rn.txt", header = TRUE, sep = "\t")
#
#
# newt <- uncoll[uncoll$peakID %in% S4$peakID,] #5099 - missing chimeras
# #newt <- newt[,c(2,4, 24, 26:33)]
# #need to make a new column that summarizes the seed
# #perf are all NA, one mis are all NA
# newt$target_type <- ifelse(!is.na(newt$eight_target), paste0("8mer"), NA)
# #should be no 7M8 and 7A1 in common except 8mer
# check <- newt[is.na(newt$target_type),]
# #have 8mer nas but 7A1 and 7m8 are both true, why?
# seed_table <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/all_putative_known_sRNA_seeds2.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE)
# seeds_df <- merge(S3[,c(1:2,38)], seed_table, by.x = "sequence", by.y = "FL", all.x =T)
# #ok, because 7a8 and A7a1 for some families are the same e.g. 11900, both are AAAAAAA; but 8mer is AAAAAAAA
# #check2 <- check[!is.na(check$seven_A1_target) & !is.na(check$sevenM8_target),]
#
# #so need to keep the one with the most support
# #if there is an 8mer for that smallRNA family, just put that for each peak
# newt$ID <- paste0(newt$peakID, "_", newt$miRNA_family)
# eightmer <- newt[!is.na(newt$target_type),]
# eightmer$dups <- duplicated(eightmer$ID)
# eightmer_rm <- eightmer[eightmer$dups=="FALSE",] #use this one
#
# #remove 8mers from same peak and miRNA family from other targets
# check$ID <- paste0(check$peakID, "_", check$miRNA_family)
# check <- check[!check$ID %in% eightmer_rm$ID,]
#
# #now with check, classify 7mer types
# #can also be because pattern of 6mer match is the same, but have both 7mer-A1 nd 7mer-m8 in same ext 18mer seq at different positions, but not 8mer
# both7 <- check[!is.na(check$seven_A1_target) & !is.na(check$sevenM8_target),]
# both7$dups <- duplicated(both7$ID)
# both7$target_type <- paste0("7A1/7M8mer")
#
# usevens <- check[!check$ID %in% both7$ID,]
# usevens$dups <- duplicated(usevens$ID)
# #length(which(!is.na(usevens$seven_A1_target) & !is.na(usevens$sevenM8_target))) 0, good
# sevenA1 <- usevens[!is.na(usevens$seven_A1_target),]
# sevenA1$dups <- duplicated(sevenA1$ID)
# sevenA1 <- sevenA1[sevenA1$dups=="FALSE",]
#
# sevenm8 <- usevens[!is.na(usevens$sevenM8_target),]
# sevenm8$dups <- duplicated(sevenm8$ID)
# sevenm8 <- sevenm8[sevenm8$dups=="FALSE",]
#
# sevenA1$dups <- sevenA1$ID %in% sevenm8$ID
# #so if in both, is supported by both 7mer-m8 and 7mer-A1 otherwise is just one or the other
#
# sevenm8$dups <- sevenm8$ID %in% sevenA1$ID
# sevenA1$target_type <- ifelse(sevenA1$dups=="TRUE", paste0("7A1/7M8mer"), paste0("7A1mer"))
# sevenm8$target_type <- ifelse(sevenm8$dups=="TRUE", paste0("7A1/7M8mer"), paste0("7M8mer"))
# sevens <- rbind(sevenm8,sevenA1)
# sevens$dups <- duplicated(sevens$ID)
# sevens <- sevens[sevens$dups=="FALSE",]
#
# sofar <- rbind(eightmer_rm, both7, sevens) #checked and all peakID/sRNA family are unique, good
#
# six <- newt[!newt$ID %in% sofar$ID,]
# six$dups <- duplicated(six$ID)
# six <- six[six$dups=="FALSE",]
# six$target_type <- paste0("6mer")
#
# pred <- rbind(six, sofar) #checked and all unique peakID/sRNA, good, and no NAs
#
# #then need to do the same format for chimera - if in common, put both; otherwise, just rbind
#
# #so first, only need to consider chimera if it is in peak of interest
# all <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_matrix_all_peaks_add_filtering.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
#
# #for Fig 4, start by trying normalizing to reads in all peaks
# norm <- colSums(all[,219:226])
# all$log2aegyptiAgo1_norm_target <- log2(((all$aegyptiAgo1_counts+1)/7297949)*1E6)
# all$log2aegyptiAgo2_norm_target <- log2(((all$aegyptiAgo2_counts+1)/1481167)*1E6)
# all$log2aegyptirIgG_norm_target <- log2(((all$aegyptirIgG_counts+1)/777251)*1E6)
# all$log2aegyptimIgG_norm_target <- log2(((all$aegyptimIgG_counts+1)/221290)*1E6)
# all$log2Aag2Ago1_norm_target <- log2(((all$Aag2Ago1_counts+1)/10744772)*1E6)
# all$log2Aag2Ago2_norm_target <- log2(((all$Aag2Ago2_counts+1)/2851099)*1E6)
# all$log2Aag2rIgG_norm_target <- log2(((all$Aag2rIgG_counts+1)/1122306)*1E6)
# all$log2Aag2mIgG_norm_target <- log2(((all$Aag2mIgG_counts+1)/1060819)*1E6)
#
#
# #make GR for seed search
# allGR <- makeGRangesFromDataFrame(all,
# keep.extra.columns=TRUE,
# ignore.strand=FALSE,
# seqinfo=NULL,
# seqnames.field="seqnames",
# start.field="start",
# end.field="end",
# strand.field="strand",
# starts.in.df.are.0based=FALSE)all <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_matrix_all_peaks_add_filtering.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
#
# #just to keep formatting the same
# norm <- colSums(all[,219:226])
# all$log2aegyptiAgo1_norm_target <- log2(((all$aegyptiAgo1_counts+1)/7297949)*1E6)
# all$log2aegyptiAgo2_norm_target <- log2(((all$aegyptiAgo2_counts+1)/1481167)*1E6)
# all$log2aegyptirIgG_norm_target <- log2(((all$aegyptirIgG_counts+1)/777251)*1E6)
# all$log2aegyptimIgG_norm_target <- log2(((all$aegyptimIgG_counts+1)/221290)*1E6)
# all$log2Aag2Ago1_norm_target <- log2(((all$Aag2Ago1_counts+1)/10744772)*1E6)
# all$log2Aag2Ago2_norm_target <- log2(((all$Aag2Ago2_counts+1)/2851099)*1E6)
# all$log2Aag2rIgG_norm_target <- log2(((all$Aag2rIgG_counts+1)/1122306)*1E6)
# all$log2Aag2mIgG_norm_target <- log2(((all$Aag2mIgG_counts+1)/1060819)*1E6)
#
#
# #make GR for seed search
# allGR <- makeGRangesFromDataFrame(all,
# keep.extra.columns=TRUE,
# ignore.strand=FALSE,
# seqinfo=NULL,
# seqnames.field="seqnames",
# start.field="start",
# end.field="end",
# strand.field="strand",
# starts.in.df.are.0based=FALSE)
# allGR70 <- resize(allGR,width=70,fix="center")
# allGR70@elementMetadata <- allGR70@elementMetadata[,c(1, 196:204)]
#
# #read in chimeras
# Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped"
# allmychimera <- dir(Dir,full.names = TRUE,pattern="*_chimera_remap.bed")
# chimbed <- lapply(allmychimera, import.bed)
# names(chimbed) <- gsub("/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped/", "", allmychimera)
# names(chimbed) <- gsub("_remap.bed", "", names(chimbed))
# #subset chimeras by overlaps
#
# chimbed_merge <- lapply(chimbed, function(x) mergeByOverlaps(x, allGR70, type="within"))
#
# test <- lapply(chimbed_merge, function(x) as.data.frame(cbind(as.character(x$name), as.character(x$peakID ))))
#
# col.names <- c("chim_miRNA", "peakID")
# chim_ls <- lapply(test, setNames, col.names)
# #chim_ls <- lapply(chim_ls, setNames, col.names)
# t <- lapply(chim_ls, function(x) smallRNA <- str_split_fixed(x$chim_miRNA, ";", 2)[,2])
# t <- lapply(t, as.data.frame)
# t <- lapply(t, setNames, c("smallRNA"))
#
# x <- mapply(cbind, chim_ls, t, SIMPLIFY=FALSE)
#
# #merge small RNA name from chim with final name annotation
# listm <- seed_table[,c(1:2)]
# listm$reform <- ifelse(grepl("novel", listm$smallRNA), paste0(listm$sequence), paste0(listm$smallRNA))
#
# for (i in 1:length(x)) {
# x[[i]] <- merge(x[[i]], listm, by.x = "smallRNA",by.y = "reform", all.x = TRUE)
# }
#
#
# test <- lapply(x, function(x) x %>% group_by(.dots=c("peakID", "smallRNA.y")) %>% summarize(count=n()))
# test <- lapply(test, as.data.frame)
#
# test <- lapply(test, function(x) x[!is.na(x$smallRNA.y),])
# lim <- test[!grepl("IgG", names(test))]
#
# #need to change to family name
# lim <- lapply(lim, function(x) merge(x, seeds_df[,c(1, 38)], by.x="smallRNA.y", by.y="smallRNA", all.x=T))
# see <- vector("list", 4)
# for(i in 1:length(lim)) {
# lim[[i]]$ID <- paste0(lim[[i]]$peakID, "_", lim[[i]]$aae_smallRNA_family)
# }
#
# #checked that all names match with pred
# cells_Ago1_chim <- lim$Aag2Ago1_chimera
# cells_Ago2_chim <- lim$Aag2Ago2_chimera
# aae_Ago1_chim <- lim$aegyptiAgo1_chimera
# aae_Ago2_chim <- lim$aegyptiAgo2_chimera
#
#
# cells_Ago1_peaks_filt <- subset(S4$peakID, grepl("Ago1_cells", S4$hi_confidence_target))
# cells_Ago2_peaks_filt <- subset(S4$peakID, grepl("Ago2_cells", S4$hi_confidence_target))
# aegypti_Ago1_peaks_filt <- subset(S4$peakID, grepl("Ago1_aegypti", S4$hi_confidence_target))
# aegypti_Ago2_peaks_filt <- subset(S4$peakID, grepl("Ago2_aegypti", S4$hi_confidence_target))
#
# #get only filt peaks
# cells_Ago1_chim <- cells_Ago1_chim[cells_Ago1_chim$peakID %in% cells_Ago1_peaks_filt,]
# cells_Ago2_chim <- cells_Ago2_chim[cells_Ago2_chim$peakID %in% cells_Ago2_peaks_filt,]
# aae_Ago1_chim <- aae_Ago1_chim[aae_Ago1_chim$peakID %in% aegypti_Ago1_peaks_filt,]
# aae_Ago2_chim <- aae_Ago2_chim[aae_Ago2_chim$peakID %in% aegypti_Ago2_peaks_filt,]
#
# #after target type, merge with transcript name; then want for what protein
#
# pred$sample <- ifelse(pred$peakID %in% aegypti_Ago1_peaks_filt, paste0("aegypti_AGO1"), NA)
# pred$sample2 <- ifelse(pred$peakID %in% aegypti_Ago2_peaks_filt, paste0("aegypti_AGO2"),NA)
# pred$sample3 <- ifelse(pred$peakID %in% cells_Ago1_peaks_filt, paste0("Aag2_AGO1"),NA)
# pred$sample4 <- ifelse(pred$peakID %in% cells_Ago2_peaks_filt, paste0("Aag2_AGO2"),NA)
#
# pred$chim3 <- ifelse(pred$ID %in% cells_Ago1_chim$ID & pred$sample3=="Aag2_AGO1", paste0("Aag2_AGO1"), NA)
# pred$chim <- ifelse(pred$ID %in% aae_Ago1_chim$ID & pred$sample=="aegypti_AGO1", paste0("aegypti_AGO1"), NA)
# pred$chim2 <- ifelse(pred$ID %in% aae_Ago2_chim$ID & pred$sample2=="aegypti_AGO2", paste0("aegypti_AGO2"), NA)
# pred$chim4 <- ifelse(pred$ID %in% cells_Ago2_chim$ID & pred$sample4=="Aag2_AGO2", paste0("Aag2_AGO2"), NA)
#
#
# pred_lim <- pred[,c(2,4,24,34:44)]
# names(pred_lim )[names(pred_lim) == "miRNA_family"] <- "aae_smallRNA_family"
#
# #need to add chim that are not also pred
# cells_Ago1_chim <- cells_Ago1_chim[!cells_Ago1_chim$ID %in% subset(pred_lim$ID, pred_lim$chim3=="Aag2_AGO1"),]
# cells_Ago2_chim <- cells_Ago2_chim[!cells_Ago2_chim$ID %in% subset(pred_lim$ID, pred_lim$chim4=="Aag2_AGO2"),]
# aae_Ago1_chim <- aae_Ago1_chim[!aae_Ago1_chim$ID %in% subset(pred_lim$ID, pred_lim$chim=="aegypti_AGO1"),]
# aae_Ago2_chim <- aae_Ago2_chim[!aae_Ago2_chim$ID %in% subset(pred_lim$ID, pred_lim$chim2=="aegypti_AGO2"),]
#
#
# #need to format each chimera set to match predlim and Rbind all together
# #first remove fam dups
# cells_Ago1_chim$count <- NULL
# cells_Ago1_chim <- cells_Ago1_chim[!duplicated(cells_Ago1_chim$ID),]
#
# cells_Ago2_chim <- cells_Ago2_chim[!duplicated(cells_Ago2_chim$ID),]
# cells_Ago2_chim$count <- NULL
#
# aae_Ago2_chim <- aae_Ago2_chim[!duplicated(aae_Ago2_chim$ID),]
# aae_Ago2_chim$count <- NULL
#
# aae_Ago1_chim <- aae_Ago1_chim[!duplicated(aae_Ago1_chim$ID),]
# aae_Ago1_chim$count <- NULL
#
# #first simplify samples and chimera columns
# pred_lim$sample_cat <- paste0(pred_lim$sample)
#
# pred_lim$sample_cat <- ifelse(!is.na(pred_lim$sample2) & pred_lim$sample_cat=="aegypti_AGO1", paste0(pred_lim$sample_cat, "; ", pred_lim$sample2), paste0(pred_lim$sample_cat))
#
# pred_lim$sample_cat <- ifelse(!is.na(pred_lim$sample2) & pred_lim$sample_cat=="NA", paste0(pred_lim$sample2), pred_lim$sample_cat)
#
# pred_lim$sample_cat <- ifelse(!is.na(pred_lim$sample3) & pred_lim$sample_cat!="NA", paste0(pred_lim$sample_cat, "; ",pred_lim$sample3), pred_lim$sample_cat)
#
# pred_lim$sample_cat <- ifelse(!is.na(pred_lim$sample3) & pred_lim$sample_cat=="NA", paste0(pred_lim$sample3), pred_lim$sample_cat)
#
# pred_lim$sample_cat <- ifelse(!is.na(pred_lim$sample4) & pred_lim$sample_cat!="NA", paste0(pred_lim$sample_cat, "; ",pred_lim$sample4), pred_lim$sample_cat)
#
# pred_lim$sample_cat <- ifelse(!is.na(pred_lim$sample4) & pred_lim$sample_cat=="NA", paste0(pred_lim$sample4), pred_lim$sample_cat)
#
# pred_lim$sample <- NULL
# pred_lim$sample2 <- NULL
# pred_lim$sample3 <- NULL
# pred_lim$sample4 <- NULL
#
# names(pred_lim )[names(pred_lim) == "sample_cat"] <- "sample"
#
# pred_lim$chim_cat <- paste0(pred_lim$chim)
#
# pred_lim$chim_cat <- ifelse(!is.na(pred_lim$chim2) & pred_lim$chim_cat=="aegypti_AGO1", paste0(pred_lim$chim_cat, "; ", pred_lim$chim2), paste0(pred_lim$chim_cat))
#
# pred_lim$chim_cat <- ifelse(!is.na(pred_lim$chim2) & pred_lim$chim_cat=="NA", paste0(pred_lim$chim2), pred_lim$chim_cat)
#
# pred_lim$chim_cat <- ifelse(!is.na(pred_lim$chim3) & pred_lim$chim_cat!="NA", paste0(pred_lim$chim_cat, "; ",pred_lim$chim3), pred_lim$chim_cat)
#
# pred_lim$chim_cat <- ifelse(!is.na(pred_lim$chim3) & pred_lim$chim_cat=="NA", paste0(pred_lim$chim3), pred_lim$chim_cat)
#
# pred_lim$chim_cat <- ifelse(!is.na(pred_lim$chim4) & pred_lim$chim_cat!="NA", paste0(pred_lim$chim_cat, "; ",pred_lim$chim4), pred_lim$chim_cat)
#
# pred_lim$chim_cat <- ifelse(!is.na(pred_lim$chim4) & pred_lim$chim_cat=="NA", paste0(pred_lim$chim4), pred_lim$chim_cat)
#
# pred_lim$chim <- NULL
# pred_lim$chim2 <- NULL
# pred_lim$chim3 <- NULL
# pred_lim$chim4 <- NULL
# pred_lim$dups <- NULL
#
# names(pred_lim )[names(pred_lim) == "chim_cat"] <- "chimera"
# pred_lim$target_type <- ifelse(pred_lim$chimera!="NA", paste0(pred_lim$target_type, "; chimera"), pred_lim$target_type)
#
#
# #rbind chim
# cells_Ago1_chim$sample <- paste0("Aag2_AGO1")
# cells_Ago2_chim$sample <- paste0("Aag2_AGO2")
# aae_Ago1_chim$sample <- paste0("aegypti_AGO1")
# aae_Ago2_chim$sample <- paste0("aegypti_AGO2")
#
# cells_Ago1_chim$chimera <- paste0("Aag2_AGO1")
# cells_Ago2_chim$chimera <- paste0("Aag2_AGO2")
# aae_Ago1_chim$chimera <- paste0("aegypti_AGO1")
# aae_Ago2_chim$chimera <- paste0("aegypti_AGO2")
#
# all_chim <- rbind(cells_Ago1_chim, cells_Ago2_chim, aae_Ago2_chim, aae_Ago1_chim)
# all_chim$smallRNA.y <- NULL
# all_chim$target_type <- paste0("chimera")
# all_chim$dups <- duplicated(all_chim$ID)
#
# #fix ones that were called for same peak as chimeras for both AGO1 aand AGO2
# all_chim$sample[all_chim$ID == "2:241249939_241249961:+_aae-let-7"] <- "aegypti_AGO1; aegypti_AGO2"
# all_chim$chimera[all_chim$ID == "2:241249939_241249961:+_aae-let-7"] <- "aegypti_AGO1; aegypti_AGO2"
#
# all_chim$sample[all_chim$ID == "3:113630333_113630346:-_aae-miR-184"] <- "aegypti_AGO1; aegypti_AGO2"
# all_chim$chimera[all_chim$ID == "3:113630333_113630346:-_aae-miR-184"] <- "aegypti_AGO1; aegypti_AGO2"
#
# all_chim$sample[all_chim$ID == "2:19218331_19218369:-_aae-miR-277-3p"] <- "aegypti_AGO1; aegypti_AGO2"
# all_chim$chimera[all_chim$ID == "2:19218331_19218369:-_aae-miR-277-3p"] <- "aegypti_AGO1; aegypti_AGO2"
#
# all_chim$sample[all_chim$ID == "2:272474588_272474602:+_aae-miR-965"] <- "aegypti_AGO1; aegypti_AGO2"
# all_chim$chimera[all_chim$ID == "2:272474588_272474602:+_aae-miR-965"] <- "aegypti_AGO1; aegypti_AGO2"
# all_chim <- all_chim[all_chim$dups=="FALSE",]
#
# all_chim$ann_group <- NA
# all_chim$dups <- NULL
# tog <- rbind(pred_lim, all_chim)
#
# tog$dups <- duplicated(tog$ID) #checked and no dups
#
# aa1 <- tog[grepl("aegypti_AGO1", tog$sample),]
# aegypti_Ago1_peaks_filt %in% unique(aa1$peakID)
#
# aa2 <- tog[grepl("aegypti_AGO2", tog$sample),]
# aegypti_Ago2_peaks_filt %in% unique(aa2$peakID)
#
# ca1 <- tog[grepl("Aag2_AGO1", tog$sample),]
# cells_Ago1_peaks_filt %in% unique(ca1$peakID)
#
# ca2 <- tog[grepl("Aag2_AGO2", tog$sample),]
# cells_Ago2_peaks_filt %in% unique(ca2$peakID)
# tog$dups <- NULL
# tog$ann_group <- NULL
# tog2 <- merge(tog, S4[,c(1:7, 12:13,64,81)], by = "peakID", all.x=T)
#
# tog2 <- tog2[,c(2:3, 1,7:14,5:6,15:16)]
#
# names(tog2)[names(tog2) == "Ptop_smallRNA"] <- "top_smallRNA"
#
# tog2$top_smallRNA <- NULL
# tog2$top_smallRNA_chimera <- NULL
# tog2$target_type <- ifelse(grepl("7A1", tog2$target_type), gsub("7A1mer", "7merA1", tog2$target_type), paste0(tog2$target_type))
# tog2$target_type <- ifelse(grepl("7M8", tog2$target_type), gsub("7M8mer", "7merM8", tog2$target_type), paste0(tog2$target_type))
#
# #write.table(tog2, "/Users/kathryn/Reprocess_all_paper_datasets/Mol_Cell_resub/Revision_files/Supplementary_Tables/Table_S5_lim.txt", col.names = T, row.names = F, quote = F, sep = "\t")
Output table is the RNAi network map of high-confidence targets provided in the paper (Table S5)
#
# #fix Ptop_small_NA column for Table S5 ext
#
# S4_all$top_smallRNA_chimera <- gsub("Ago", "AGO", S4_all$top_smallRNA_chimera)
#
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago1_cells", S4_all$Ptop_smallRNA) & grepl("Ago1_aegypti", S4_all$Ptop_smallRNA) & !grepl("Ago1_both", S4_all$Ptop_smallRNA), paste0("Ago1_both; ", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago1_both", S4_all$Ptop_smallRNA), gsub("Ago1_cells; ", "", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago1_both", S4_all$Ptop_smallRNA), gsub("; Ago1_cells", "", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago1_both", S4_all$Ptop_smallRNA), gsub("Ago1_aegypti; ", "", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago1_both", S4_all$Ptop_smallRNA), gsub("; Ago1_aegypti", "", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
#
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago2_cells", S4_all$Ptop_smallRNA) & grepl("Ago2_aegypti", S4_all$Ptop_smallRNA) & !grepl("Ago2_both", S4_all$Ptop_smallRNA), paste0("Ago2_both; ", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago2_both", S4_all$Ptop_smallRNA), gsub("Ago2_cells; ", "", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago2_both", S4_all$Ptop_smallRNA), gsub("; Ago2_cells", "", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago2_both", S4_all$Ptop_smallRNA), gsub("Ago2_aegypti; ", "", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
# S4_all$Ptop_smallRNA <- ifelse(grepl("Ago2_both", S4_all$Ptop_smallRNA), gsub("; Ago2_aegypti", "", S4_all$Ptop_smallRNA), paste0(S4_all$Ptop_smallRNA))
#
# S4_all$Ptop_smallRNA <- gsub("Ago", "AGO", S4_all$Ptop_smallRNA)
#
# #write.table(S4_all, "/Users/kathryn/Reprocess_all_paper_datasets/Mol_Cell_resub/Revision_files/Supplementary_Tables/Table_S5_ext.txt", col.names = T, row.names = F, quote = F, sep = "\t")
The final extended output table, “Table_S5_ext.txt.zip” is available in my GitHub; the legend is also uploaded