RNAseq_target_validation

Select Ago1 targets for ovary-expressed miRNAs

input file “Table_S5_ext.txt” is available as a zip file in my Github

all_with_pats <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Mol_cell_resub/Revision_files/Supplementary_Tables/Table_S5_ext.txt", header = TRUE, sep="\t", stringsAsFactors = FALSE)

all_merge_aegypti_Ago1 <- subset(all_with_pats, all_with_pats$aegypti_Ago1_BCsample > 0)
aegypti_Ago1_3UTR <- all_merge_aegypti_Ago1[grepl( "3' UTR", all_merge_aegypti_Ago1$annotation),]

Select miRNAs

used Table S18 and S19 from Akbari et al., 2013

#miRNAs that are high in the ovaries (>5000 RPM) and rare in the carcass (<1000 RPM)

##miR989
all_3UTR_989 <- rbind(aegypti_Ago1_3UTR[grepl("miR-989x", aegypti_Ago1_3UTR$aegyptiAgo1_chimera),], aegypti_Ago1_3UTR[grepl("miR-989", aegypti_Ago1_3UTR$PsmallRNA_family),])

##miR996
all_3UTR_996 <- rbind(aegypti_Ago1_3UTR[grepl("miR-996x", aegypti_Ago1_3UTR$aegyptiAgo1_chimera),], aegypti_Ago1_3UTR[grepl("miR-996", aegypti_Ago1_3UTR$PsmallRNA_family),])

##miR2946
all_3UTR_2946 <- rbind(aegypti_Ago1_3UTR[grepl("miR-2946x", aegypti_Ago1_3UTR$aegyptiAgo1_chimera),], aegypti_Ago1_3UTR[grepl("miR-2941", aegypti_Ago1_3UTR$PsmallRNA_family),]) #2946 shares seed with 2941, family name is 2941

##also select control miRNAs that are rare in the ovaries (<1000 RPM) and high in the carcass (>4000 RPM)

##miR981
all_3UTR_981 <- rbind(aegypti_Ago1_3UTR[grepl("miR-981x", aegypti_Ago1_3UTR$aegyptiAgo1_chimera),], aegypti_Ago1_3UTR[grepl("miR-981", aegypti_Ago1_3UTR$PsmallRNA_family),])

##miR7
all_3UTR_7<- rbind(aegypti_Ago1_3UTR[grepl("miR-7x", aegypti_Ago1_3UTR$aegyptiAgo1_chimera),], aegypti_Ago1_3UTR[grepl("miR-7", aegypti_Ago1_3UTR$PsmallRNA_family),])
#need to remove 79/71
all_3UTR_7 <- all_3UTR_7[!grepl("aae-miR-71", all_3UTR_7$PsmallRNA_family),]
all_3UTR_7 <- all_3UTR_7[!grepl("aae-miR-79", all_3UTR_7$PsmallRNA_family),]

##277 is high in the head, maybe can see that in brain or other head tissues - just for reviewer
all_3UTR_277<- rbind(aegypti_Ago1_3UTR[grepl("miR-277-3px", aegypti_Ago1_3UTR$aegyptiAgo1_chimera),], aegypti_Ago1_3UTR[grepl("miR-277-3p", aegypti_Ago1_3UTR$PsmallRNA_family),])
all_3UTR_277 <- all_3UTR_277[,c("geneId", "transcriptId")]
all_3UTR_277 <- unique(all_3UTR_277)


#write out gene names
h <-grep("all_3UTR",names(.GlobalEnv),value=TRUE)
h<- do.call("list",mget(h))

h <- lapply(h, function(x) x[,c("geneId", "transcriptId")])
hu <- lapply(h, unique)

#run this to write out gene lists:
# for(i in seq_along(hu)) {
#   write.table(hu[i], paste0("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Ago1_lists_FINAL/", names(hu)[i], ".txt"), 
#               col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
# }

#write.table(all_3UTR_277, "/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Ago1_lists_FINAL/all_3UTR_277.txt", col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)

Ago1 miRNA target gene lists “all_3UTR_2946.txt”,“all_3UTR_7.txt”, “all_3UTR_981.txt”, “all_3UTR_989.txt”, “all_3UTR_996.txt” are available in my Github in the folder “Ago1_lists_FINAL”

Get Ovary high and low genes as controls

Data were obtained from Table S6 from Matthews et al., 2016
Alternatively, you can access the file “Matthews_S6.txt” with RNAseq TPM for all tissues on my Github

diff_genes <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Matthews_S6.txt", header = TRUE, sep = "\t")

Fe_Ov_all <- grep("Fe_Ov", colnames(diff_genes), value = TRUE)
Fe_all <- grep("Fe", colnames(diff_genes), value = TRUE)

diff_genes$Ov_av_all <- rowMeans(diff_genes[Fe_Ov_all])
diff_genes$all_Fe_notOv <- rowMeans(diff_genes[Fe_all])

diff_genes$Ov_hi <- diff_genes$Ov_av_all/diff_genes$all_Fe
diff_genes$Ov_lo <- diff_genes$all_Fe/diff_genes$Ov_av_all

Ov_hi <- diff_genes %>% top_n(50 , Ov_hi) %>% dplyr::select(gene)
Ov_lo <- diff_genes %>% top_n(-50 , Ov_hi) %>% dplyr::select(gene)

h <-grep("Ov",names(.GlobalEnv),value=TRUE)
h <- grep("lo|hi", h, value = TRUE)
h<- do.call("list",mget(h))
hu <- lapply(h, unique)
#for(i in seq_along(hu)) {
#  write.table(hu[i], #paste0("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Ago1_lists_FI#NAL/", names(hu)[i], ".txt"), 
#              col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
#}

Control gene lists “Ov_hi2.txt” and “Ov_lo2.txt” are available in my Github in the folder “Ago1_lists_FINAL”

GSVA for Ago1 ovary-specific miRNA targets (Figure S7C)

Input files are available for download from the Vosshall lab Github associated with the Matthews et al., 2016 paper
Alternatively, you can go to my Github to access all the necessary data

PECounts <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Aedes_neurotranscriptome-home-newDESEQ2/raw_gene_counts/PE_featurecounts.txt",sep = "\t",skip=1)
SECounts <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Aedes_neurotranscriptome-home-newDESEQ2/raw_gene_counts/SE_featurecounts.txt",sep = "\t",skip=1)
PECounts2 <- PECounts[,grepl("bam",colnames(PECounts))] %>% as.matrix %>% set_colnames(gsub("_\\..*","",colnames(.))) %>% set_rownames(PECounts$Geneid)
SECounts2 <- SECounts[,grepl("bam",colnames(SECounts))] %>% as.matrix %>% set_colnames(gsub("_merged.bam|bams\\.|Aligned.*|_SE","",colnames(.))) %>% set_rownames(SECounts$Geneid)
annotation <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Aedes_neurotranscriptome-home-newDESEQ2/annotations/annotation_with_orthodb-20150730.csv",sep=",")

myAgo1List <- dir("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Ago1_lists_FINAL/",full.names = TRUE) %>% lapply(function(x)read.delim(x,header=TRUE)[,1] %>% as.vector)

#map to their geneIDs
Ago1setList <- lapply(myAgo1List,function(x) unique(annotation[annotation[,2] %in% x,1]) %>% na.omit %>% as.vector)
names(Ago1setList) <- dir("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Ago1_lists_FINAL/") %>% gsub("\\.txt","",.)

SECounts2 <- SECounts2[,!colnames(SECounts2) %in% c("Fe_An_BF_1","Fe_An_SF_5")]
myNewCounts <- merge(SECounts2,PECounts2,by=0)

myNewCounts <- myNewCounts[,!colnames(myNewCounts) %in% "Fe_Br_SF_2"]
metaData<- data.frame(Group=gsub("_\\d$","",colnames(myNewCounts)),row.names = colnames(myNewCounts))
metaData2 <- data.frame(Group=gsub("_\\d$","",colnames(myNewCounts)),
                        Paired=ifelse(colnames(myNewCounts) %in% colnames(PECounts2),"Paired","Single"),
                        row.names = colnames(myNewCounts))

myNewCounts2 <- myNewCounts[,-1]
rownames(myNewCounts2) <- myNewCounts[,1]
myCounts2 <- edgeR::cpm(myNewCounts2)
require(GSVA)

Fe <- grep("Fe", colnames(myCounts2), value = TRUE)
mhCountsAgo1 <- gsva(myCounts2[,colnames(myCounts2) %in% Fe],gset.idx.list = Ago1setList[grepl("UTR|Ov", names(Ago1setList))],kcdf=c("Poisson"))

## Warning in .local(expr, gset.idx.list, ...): 317 genes with constant
## expression values throuhgout the samples.

## Warning in .local(expr, gset.idx.list, ...): Since argument method!
## ="ssgsea", genes with constant expression values are discarded.

## Estimating GSVA scores for 8 gene sets.
## Computing observed enrichment scores
## Estimating ECDFs with Poisson kernels
## Using parallel with 12 cores
## 
  |                                                                       
  |                                                                 |   0%

#visualize with heatmap
colors <- metaData2
colors <- as.data.frame(unique(colors$Group))
colors <- setNames(colors, c("Group"))
colors$color <- ifelse(grepl("Ov_O", colors$Group), paste0("lightslateblue"), paste0(colors$color))
colors$color <- ifelse(grepl("Ov_SF", colors$Group), paste0("plum"), paste0(colors$color))
mycolors <- as.character(colors$color) 
names(mycolors) <- colors$Group
mycolors <- list(Group = mycolors)
mycolors$Paired <- c("gray50", "black")
names(mycolors$Paired) <- c("Paired", "Single")

paletteLength <- 50
palAgo1 <- colorRampPalette(c( "#08306B","white", "darkorange"))(paletteLength)
myBreaks <- c(seq(min(mhCountsAgo1), 0, length.out=ceiling(paletteLength/2) + 1), seq(max(mhCountsAgo1)/paletteLength, max(mhCountsAgo1), length.out=floor(paletteLength/2)))

pheatmap:::pheatmap(mhCountsAgo1,
                    annotation_col = metaData2,
                     color = palAgo1, breaks = myBreaks,
                    cluster_cols = TRUE,cluster_rows = TRUE, scale="none", fontsize_col = 6, annotation_colors = mycolors, treeheight_row = 20, fontsize = 6, cellheight = 25, cellwidth = 6)

eCDFs of miRNA target expression, Figures 7A-7C, S7A and S7B

Need to normalize by transcript length for this analysis, use gtf with Matthews et al. gene IDs As above, input files are available for download from the Vosshall lab Github associated with the Matthews et al., 2016 paper
Alternatively, you can go to my Github to access all the necessary data

gtffile <- "/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Aedes_neurotranscriptome-home-newDESEQ2/annotations/STAR_featureCount.gtf"

TxDb <- makeTxDbFromGFF(gtffile)

## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ... OK

myLens <- transcriptLengths(TxDb) %>% group_by(gene_id) %>% summarise(gene_len=median(tx_len))

myNewCounts3 <- merge(myNewCounts2, myLens, by.x=0, by.y= "gene_id", all.x=TRUE)
row.names(myNewCounts3) <- myNewCounts3$Row.names

d <- DGEList(counts = myNewCounts3[,c(2:130)])
d$genes$Length <- c(myNewCounts3$gene_len)
myCountstest <- edgeR::rpkm(d)
myCountstest <- as.data.frame(myCountstest)


Fe_all <- grep("Fe", colnames(myCountstest), value = TRUE)
Fe_Ov <- grep("Ov", colnames(myCountstest), value = TRUE)
Fe_Ov_SF <- grep("SF", Fe_Ov, value = TRUE)
Fe_Ov_O <- grep("SF", Fe_Ov, value = TRUE, invert = TRUE)

myCountstest$Fe_Ov_SF_av <- rowMeans(myCountstest[Fe_Ov_SF])
myCountstest$Fe_Ov_O_av <- rowMeans(myCountstest[Fe_Ov_O])
myCountstest$Fe_mean <- rowMeans(myCountstest[Fe_all])

#high in ovary miRNAs
##miR-989
test989 <- myCountstest[row.names(myCountstest) %in% Ago1setList$all_3UTR_989,]
test989<- test989[grepl("mean|av", colnames(test989))]
test989<- melt(test989)

## No id variables; using all as measure variables

ggplot(test989, aes(log2(value), colour = variable)) + stat_ecdf(geom = "point", size=1) + scale_colour_manual(values=c("plum", "lightslateblue","gray")) + theme_bw()

## Warning: Removed 3 rows containing non-finite values (stat_ecdf).

wilcox.test(subset(test989$value, test989$variable=="Fe_Ov_O_av") , subset(test989$value, test989$variable=="Fe_mean"),
            alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test989$value, test989$variable == "Fe_Ov_O_av") and subset(test989$value, test989$variable == "Fe_mean")
## W = 3675.5, p-value = 0.02335
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(test989$value, test989$variable=="Fe_Ov_SF_av") , subset(test989$value, test989$variable=="Fe_mean"),
            alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test989$value, test989$variable == "Fe_Ov_SF_av") and subset(test989$value, test989$variable == "Fe_mean")
## W = 3585.5, p-value = 0.01287
## alternative hypothesis: true location shift is less than 0

##miR-996
test996 <- myCountstest[row.names(myCountstest) %in% Ago1setList$all_3UTR_996,]
test996<- test996[grepl("mean|av", colnames(test996))]
test996<- melt(test996)

## No id variables; using all as measure variables

ggplot(test996, aes(log2(value), colour = variable)) + stat_ecdf(geom = "point", size=1) + scale_colour_manual(values=c("plum", "lightslateblue","gray")) + theme_bw()

wilcox.test(subset(test996$value, test996$variable=="Fe_Ov_O_av") , subset(test996$value, test996$variable=="Fe_mean"),
            alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test996$value, test996$variable == "Fe_Ov_O_av") and subset(test996$value, test996$variable == "Fe_mean")
## W = 8334, p-value = 0.05469
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(test996$value, test996$variable=="Fe_Ov_SF_av") , subset(test996$value, test996$variable=="Fe_mean"),
            alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test996$value, test996$variable == "Fe_Ov_SF_av") and subset(test996$value, test996$variable == "Fe_mean")
## W = 8082, p-value = 0.02356
## alternative hypothesis: true location shift is less than 0

##miR-2946
test2946 <- myCountstest[row.names(myCountstest) %in% Ago1setList$all_3UTR_2946,]
test2946<- test2946[grepl("mean|av", colnames(test2946))]
test2946<- melt(test2946)

## No id variables; using all as measure variables

ggplot(test2946, aes(log2(value), colour = variable)) + stat_ecdf(geom = "point", size=1) + scale_colour_manual(values=c("plum", "lightslateblue","gray")) + theme_bw()

## Warning: Removed 3 rows containing non-finite values (stat_ecdf).

wilcox.test(subset(test2946$value, test2946$variable=="Fe_Ov_O_av") , subset(test2946$value, test2946$variable=="Fe_mean"),
            alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test2946$value, test2946$variable == "Fe_Ov_O_av") and subset(test2946$value, test2946$variable == "Fe_mean")
## W = 2008.5, p-value = 0.2209
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(test2946$value, test2946$variable=="Fe_Ov_SF_av") , subset(test2946$value, test2946$variable=="Fe_mean"),
            alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test2946$value, test2946$variable == "Fe_Ov_SF_av") and subset(test2946$value, test2946$variable == "Fe_mean")
## W = 1948.5, p-value = 0.1487
## alternative hypothesis: true location shift is less than 0

###control miRNAs (low in ovary)
##miR-7
test7 <- myCountstest[row.names(myCountstest) %in% Ago1setList$all_3UTR_7,]
test7<- test7[grepl("mean|av", colnames(test7))]
test7<- melt(test7)

## No id variables; using all as measure variables

ggplot(test7, aes(log2(value), colour = variable)) + stat_ecdf(geom = "point", size=1) + scale_colour_manual(values=c("plum", "lightslateblue","gray")) + theme_bw()

## Warning: Removed 3 rows containing non-finite values (stat_ecdf).

wilcox.test(subset(test7$value, test7$variable=="Fe_Ov_O_av"), subset(test7$value, test7$variable=="Fe_mean"), alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test7$value, test7$variable == "Fe_Ov_O_av") and subset(test7$value, test7$variable == "Fe_mean")
## W = 1514.5, p-value = 0.506
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(test7$value, test7$variable=="Fe_Ov_SF_av"), subset(test7$value, test7$variable=="Fe_mean"), alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test7$value, test7$variable == "Fe_Ov_SF_av") and subset(test7$value, test7$variable == "Fe_mean")
## W = 1458.5, p-value = 0.3745
## alternative hypothesis: true location shift is less than 0

#miR-981
test981 <- myCountstest[row.names(myCountstest) %in% Ago1setList$all_3UTR_981,]
test981<- test981[grepl("mean|av", colnames(test981))]
test981<- melt(test981)

## No id variables; using all as measure variables

ggplot(test981, aes(log2(value), colour = variable)) + stat_ecdf(geom = "point", size=1) + scale_colour_manual(values=c("plum", "lightslateblue","gray")) + theme_bw()

wilcox.test(subset(test981$value, test981$variable=="Fe_Ov_O_av"), subset(test981$value, test981$variable=="Fe_mean"), alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test981$value, test981$variable == "Fe_Ov_O_av") and subset(test981$value, test981$variable == "Fe_mean")
## W = 1821, p-value = 0.6686
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(test981$value, test981$variable=="Fe_Ov_SF_av"), subset(test981$value, test981$variable=="Fe_mean"), alternative = c("l"))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(test981$value, test981$variable == "Fe_Ov_SF_av") and subset(test981$value, test981$variable == "Fe_mean")
## W = 1789, p-value = 0.604
## alternative hypothesis: true location shift is less than 0

Select Ago2 targets for ovary-expressed esiRNAs

input files “all_putative_known_sRNA_seeds2.txt” (generated in mirdeep2_processing_filtering script), “Table_S4.txt” and “all_Ago2_known_novel_fam_rn.fa” (generated in smallRNA_abundance_scatterplots script) are available in my Github

#get 3'UTRs and Ago2-supported peaks
all_merge_aegypti_Ago2 <- subset(all_with_pats, all_with_pats$aegypti_Ago2_BCsample > 0)
aegypti_Ago2_3UTR <- all_merge_aegypti_Ago2[grepl( "3' UTR", all_merge_aegypti_Ago2$annotation),]

#perfect targets (all, doesn't need to be 3'UTR)
aegypti_perf <- all_merge_aegypti_Ago2[grepl("novel|aae", all_merge_aegypti_Ago2$Pperfect_18mer_target),]
aegypti_perf <- aegypti_perf[grepl("common-AGO2|aegypti-AGO2", aegypti_perf$Pperfect_18mer_target),]

#chimera 3'UTR targets
aegypti_chim_Ago2 <- aegypti_Ago2_3UTR[grepl("AGO2_both|AGO2_aegypti", aegypti_Ago2_3UTR$top_smallRNA_chimera),]
aegypti_chim_Ago2 <- aegypti_chim_Ago2[!aegypti_chim_Ago2$peakID %in% aegypti_perf$peakID,]

#6mer 3'UTR targets
aegypti_6mer_Ago2 <- aegypti_Ago2_3UTR[grepl("AGO2_both|AGO2_aegypti", aegypti_Ago2_3UTR$Ptop_smallRNA),]
aegypti_6mer_Ago2_no_chim  <- aegypti_6mer_Ago2[!aegypti_6mer_Ago2$peakID %in% aegypti_perf$peakID,]
aegypti_6mer_Ago2_no_chim  <- aegypti_6mer_Ago2_no_chim [!aegypti_6mer_Ago2_no_chim $peakID %in% aegypti_chim_Ago2$peakID,]

#8mer targets
seed_table <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/all_putative_known_sRNA_seeds.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE) #913 unique seed combos
seeds_df <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Mol_cell_resub/Revision_files/Supplementary_Tables/Table_S4.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)

#link to small RNA names
seed_table <- merge(seeds_df[,c(1:2, 38)], seed_table,by.x = "smallRNA_sequence" , by.y = "FL", all.x= TRUE )

#select top most abundant/targeting esiRNAs
Ago2_sRNAs <- readDNAStringSet("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/all_Ago2_known_novel_fam_rn.fa", format = "fasta")
Ago2_sRNAs_aegypti <- Ago2_sRNAs[grepl("common-Ago2|aegypti-Ago2|aae-miR", names(Ago2_sRNAs))]

seed_table$filt <- seed_table$six_mer_target %in% Ago2_sRNAs_aegypti
Ago2_top_seeds <- subset(seed_table, seed_table$filt==TRUE)

#make vector of top Ago2 esiRNA names to search in 8mer column
vec8 <- as.character(Ago2_top_seeds$smallRNA)
vec8 <- gsub("AGO1", "Ago1", vec8)
vec8 <- gsub("AGO2", "Ago2", vec8)

row.names(aegypti_6mer_Ago2_no_chim) <- 1:nrow(aegypti_6mer_Ago2_no_chim)

matmatch8 <- vector("list", length(vec8))
for (i in 1:length(vec8)) {
  matmatch8[[i]] <- aegypti_6mer_Ago2_no_chim[grepl(vec8[i], aegypti_6mer_Ago2_no_chim$Peight_target),]
  matmatch8[[i]] <- matmatch8[[i]]$peakID
}

matmatch8 <- unique(unlist(matmatch8))
aegypti_Ago2_8 <- aegypti_6mer_Ago2_no_chim[aegypti_6mer_Ago2_no_chim$peakID %in% matmatch8,]
#have already removed perfect and chimera

##alt 8 
matmatch8alt <- vector("list", length(vec8))
for (i in 1:length(vec8)) {
  matmatch8alt[[i]] <- aegypti_6mer_Ago2_no_chim[grepl(vec8[i], aegypti_6mer_Ago2_no_chim$Peight_target_alt),]
  matmatch8alt[[i]] <- matmatch8alt[[i]]$peakID
}

matmatch8alt <- unique(unlist(matmatch8alt))
aegypti_Ago2_8alt <- aegypti_6mer_Ago2_no_chim[aegypti_6mer_Ago2_no_chim$peakID %in% matmatch8alt,]

#remove 8mers from 6mer no chim
aegypti_6mer_Ago2_no_chim_no8 <- aegypti_6mer_Ago2_no_chim[!aegypti_6mer_Ago2_no_chim$peakID %in% aegypti_Ago2_8alt$peakID,]
aegypti_6mer_Ago2_no_chim_no8 <- aegypti_6mer_Ago2_no_chim_no8[!aegypti_6mer_Ago2_no_chim_no8$peakID %in% aegypti_Ago2_8$peakID,]


#get non_3UTR 
aegypti_Ago2_non3UTR <- all_merge_aegypti_Ago2[!grepl( "3' UTR", all_merge_aegypti_Ago2$annotation),]
#is mutually exclusive with 6mers because they have to be in 3UTR, same for chimera
aegypti_Ago2_non3UTR <- aegypti_Ago2_non3UTR[!aegypti_Ago2_non3UTR$peakID %in% aegypti_perf$peakID,]
aegypti_Ago2_non3UTRtop <- aegypti_Ago2_non3UTR[grepl("AGO2_both|AGO2_aegypti", aegypti_Ago2_non3UTR$Ptop_smallRNA) ,]

tosel <- c("aegypti_Ago2_non3UTRtop", "aegypti_Ago2_8alt", "aegypti_Ago2_8", "aegypti_chim_Ago2", "aegypti_perf", "aegypti_6mer_Ago2_no_chim_no8" )


h<- do.call("list",mget(tosel))
h <- lapply(h, function(x) x[,c("geneId", "transcriptId")])
hu <- lapply(h, unique)
names(hu) <- names(h)


#for(i in seq_along(hu)) {
#  write.table(hu[i], paste0("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Ago2_lists_FINAL/", names(hu)[i], ".txt"), 
#              col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
#}

Ago2 esiRNA target gene lists are available in my Github in the folder “Ago2_lists_FINAL”

GSVA for Ago2, Figure S7D

myAgo2List <- dir("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Ago2_lists_FINAL/",full.names = TRUE) %>% lapply(function(x)read.delim(x,header=TRUE)[,1] %>% as.vector)

#map to their geneIDs
Ago2setList <- lapply(myAgo2List,function(x) unique(annotation[annotation[,2] %in% x,1]) %>% na.omit %>% as.vector)
names(Ago2setList) <- dir("/Users/kathryn/Reprocess_all_paper_datasets/RNAseq_test/Ago2_lists_FINAL/") %>% gsub("\\.txt","",.)

removetarg <- unlist(Ago2setList)
names(removetarg) <- NULL
nottarg <- row.names(myNewCounts2)[!row.names(myNewCounts2) %in% removetarg]
Ago2setList$nontarget <- nottarg 

Ago2setList$all8 <- unique(c(Ago2setList$aegypti_Ago2_8alt, Ago2setList$aegypti_Ago2_8))

mhCountsAgo2 <- gsva(myCounts2[,colnames(myCounts2) %in% Fe],gset.idx.list = Ago2setList[!grepl("aegypti_Ago2_8alt|aegypti_Ago2_8", names(Ago2setList))],kcdf=c("Poisson"))

## Warning in .local(expr, gset.idx.list, ...): 317 genes with constant
## expression values throuhgout the samples.

## Warning in .local(expr, gset.idx.list, ...): Since argument method!
## ="ssgsea", genes with constant expression values are discarded.

## Estimating GSVA scores for 6 gene sets.
## Computing observed enrichment scores
## Estimating ECDFs with Poisson kernels
## Using parallel with 12 cores
## 
  |                                                                       
  |                                                                 |   0%

#visualize with heatmap
colors <- metaData2
colors <- as.data.frame(unique(colors$Group))
colors <- setNames(colors, c("Group"))
colors$color <- ifelse(grepl("Ov_O", colors$Group), paste0("lightslateblue"), paste0(colors$color))
colors$color <- ifelse(grepl("Ov_SF", colors$Group), paste0("plum"), paste0(colors$color))
mycolors <- as.character(colors$color) 
names(mycolors) <- colors$Group
mycolors <- list(Group = mycolors)
mycolors$Paired <- c("gray50", "black")
names(mycolors$Paired) <- c("Paired", "Single")

paletteLength <- 50
palAgo2 <- colorRampPalette(c("darkred", "white", "darkorange"))(paletteLength)
myBreaks <- c(seq(min(mhCountsAgo2), 0, length.out=ceiling(paletteLength/2) + 1), seq(max(mhCountsAgo2)/paletteLength, max(mhCountsAgo2), length.out=floor(paletteLength/2)))


pheatmap:::pheatmap(mhCountsAgo2,
                    annotation_col = metaData2,
                     color = palAgo2, breaks = myBreaks,
                    cluster_cols = TRUE,cluster_rows = TRUE, scale="none", fontsize_col = 6, annotation_colors = mycolors, treeheight_row = 20, fontsize = 6, cellheight = 25, cellwidth = 6)

Ago2 eCDF, Figures 7E and 7F

#calculate log2FC of all tissues versus ovary, because see expression in the ovary of esiRNAs 
myCountstest$gene <- row.names(myCountstest)
perf_cdf <- myCountstest[row.names(myCountstest) %in% Ago2setList$aegypti_perf,]
perf_cdf<- perf_cdf[grepl("av|mean|gene", colnames(perf_cdf))]
perf_cdf$log2perf_Ov_SF <- log2(perf_cdf$Fe_Ov_SF_av) - log2(perf_cdf$Fe_mean)
perf_cdf$log2perf_Ov_O <- log2(perf_cdf$Fe_Ov_O_av) - log2(perf_cdf$Fe_mean)
perf_cdf <- melt(perf_cdf, id.vars = c("gene"), measure.vars = c("log2perf_Ov_SF", "log2perf_Ov_O"))

six_cdf <- myCountstest[row.names(myCountstest) %in% Ago2setList$aegypti_6mer_Ago2_no_chim_no8,]
six_cdf <- six_cdf[grepl("av|mean|gene", colnames(six_cdf))]
six_cdf$log2six_Ov_SF <- log2(six_cdf$Fe_Ov_SF_av) - log2(six_cdf$Fe_mean)
six_cdf$log2six_Ov_O <- log2(six_cdf$Fe_Ov_O_av) - log2(six_cdf$Fe_mean)
six_cdf <- melt(six_cdf, id.vars = c("gene"), measure.vars = c("log2six_Ov_SF", "log2six_Ov_O"))

eight_cdf <- myCountstest[row.names(myCountstest) %in% Ago2setList$all8,]
eight_cdf <- eight_cdf[grepl("av|mean|gene", colnames(eight_cdf))]
eight_cdf$log2eight_Ov_SF <- log2(eight_cdf$Fe_Ov_SF_av) - log2(eight_cdf$Fe_mean)
eight_cdf$log2eight_Ov_O <- log2(eight_cdf$Fe_Ov_O_av) - log2(eight_cdf$Fe_mean)
eight_cdf <- melt(eight_cdf, id.vars = c("gene"), measure.vars = c("log2eight_Ov_SF", "log2eight_Ov_O"))

chim_cdf <- myCountstest[row.names(myCountstest) %in% Ago2setList$aegypti_chim_Ago2,]
chim_cdf <- chim_cdf[grepl("av|mean|gene", colnames(chim_cdf))]
chim_cdf$log2chim_Ov_SF <- log2(chim_cdf$Fe_Ov_SF_av) - log2(chim_cdf$Fe_mean)
chim_cdf$log2chim_Ov_O <- log2(chim_cdf$Fe_Ov_O_av) - log2(chim_cdf$Fe_mean)
chim_cdf <- melt(chim_cdf, id.vars = c("gene"), measure.vars = c("log2chim_Ov_SF", "log2chim_Ov_O"))

nonUTR_cdf <- myCountstest[row.names(myCountstest) %in% Ago2setList$aegypti_Ago2_non3UTRtop,]
nonUTR_cdf <- nonUTR_cdf[grepl("av|mean|gene", colnames(nonUTR_cdf))]
nonUTR_cdf$log2nonUTR_Ov_SF <- log2(nonUTR_cdf$Fe_Ov_SF_av) - log2(nonUTR_cdf$Fe_mean)
nonUTR_cdf$log2nonUTR_Ov_O <- log2(nonUTR_cdf$Fe_Ov_O_av) - log2(nonUTR_cdf$Fe_mean)
nonUTR_cdf <- melt(nonUTR_cdf, id.vars = c("gene"), measure.vars = c("log2nonUTR_Ov_SF", "log2nonUTR_Ov_O"))

Ago2_cdf <- rbind(nonUTR_cdf, six_cdf, perf_cdf, chim_cdf, eight_cdf )


ggplot(Ago2_cdf[grepl("Ov_O", Ago2_cdf$variable),], aes(value, colour = variable)) + stat_ecdf(geom = "point", size=1) + scale_colour_manual(values=c("gray","lightpink", "red4", "red2", "hotpink")) + theme_bw()

## Warning: Removed 36 rows containing non-finite values (stat_ecdf).

ggplot(Ago2_cdf[grepl("Ov_SF", Ago2_cdf$variable),], aes(value, colour = variable)) + stat_ecdf(geom = "point", size=1) + scale_colour_manual(values=c("gray","lightpink", "red4", "red2", "hotpink" )) + theme_bw()

## Warning: Removed 32 rows containing non-finite values (stat_ecdf).

wilcox.test(subset(Ago2_cdf$value,Ago2_cdf$variable=="log2perf_Ov_SF"), subset(Ago2_cdf$value, Ago2_cdf$variable=="log2nonUTR_Ov_SF"),alternative = c("l")) #0.01023

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(Ago2_cdf$value, Ago2_cdf$variable == "log2perf_Ov_SF") and subset(Ago2_cdf$value, Ago2_cdf$variable == "log2nonUTR_Ov_SF")
## W = 8730, p-value = 0.01023
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(Ago2_cdf$value,Ago2_cdf$variable=="log2six_Ov_SF"), subset(Ago2_cdf$value, Ago2_cdf$variable=="log2nonUTR_Ov_SF"),alternative = c("l")) #0.1747

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(Ago2_cdf$value, Ago2_cdf$variable == "log2six_Ov_SF") and subset(Ago2_cdf$value, Ago2_cdf$variable == "log2nonUTR_Ov_SF")
## W = 355650, p-value = 0.1747
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(Ago2_cdf$value,Ago2_cdf$variable=="log2eight_Ov_SF"), subset(Ago2_cdf$value, Ago2_cdf$variable=="log2nonUTR_Ov_SF"),alternative = c("l")) #0.03799

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(Ago2_cdf$value, Ago2_cdf$variable == "log2eight_Ov_SF") and subset(Ago2_cdf$value, Ago2_cdf$variable == "log2nonUTR_Ov_SF")
## W = 91291, p-value = 0.03799
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(Ago2_cdf$value,Ago2_cdf$variable=="log2chim_Ov_SF"), subset(Ago2_cdf$value, Ago2_cdf$variable=="log2nonUTR_Ov_SF"),alternative = c("l")) #0.00934

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(Ago2_cdf$value, Ago2_cdf$variable == "log2chim_Ov_SF") and subset(Ago2_cdf$value, Ago2_cdf$variable == "log2nonUTR_Ov_SF")
## W = 19818, p-value = 0.00934
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(Ago2_cdf$value,Ago2_cdf$variable=="log2perf_Ov_O"), subset(Ago2_cdf$value, Ago2_cdf$variable=="log2nonUTR_Ov_O"),alternative = c("l")) #0.001305

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(Ago2_cdf$value, Ago2_cdf$variable == "log2perf_Ov_O") and subset(Ago2_cdf$value, Ago2_cdf$variable == "log2nonUTR_Ov_O")
## W = 7451, p-value = 0.001305
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(Ago2_cdf$value,Ago2_cdf$variable=="log2six_Ov_O"), subset(Ago2_cdf$value, Ago2_cdf$variable=="log2nonUTR_Ov_O"),alternative = c("l")) #0.2492

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(Ago2_cdf$value, Ago2_cdf$variable == "log2six_Ov_O") and subset(Ago2_cdf$value, Ago2_cdf$variable == "log2nonUTR_Ov_O")
## W = 357800, p-value = 0.2492
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(Ago2_cdf$value,Ago2_cdf$variable=="log2eight_Ov_O"), subset(Ago2_cdf$value, Ago2_cdf$variable=="log2nonUTR_Ov_O"),alternative = c("l")) # 0.1278

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(Ago2_cdf$value, Ago2_cdf$variable == "log2eight_Ov_O") and subset(Ago2_cdf$value, Ago2_cdf$variable == "log2nonUTR_Ov_O")
## W = 93730, p-value = 0.1278
## alternative hypothesis: true location shift is less than 0

wilcox.test(subset(Ago2_cdf$value,Ago2_cdf$variable=="log2chim_Ov_O"), subset(Ago2_cdf$value, Ago2_cdf$variable=="log2nonUTR_Ov_O"),alternative = c("l")) #0.01105

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(Ago2_cdf$value, Ago2_cdf$variable == "log2chim_Ov_O") and subset(Ago2_cdf$value, Ago2_cdf$variable == "log2nonUTR_Ov_O")
## W = 19944, p-value = 0.01105
## alternative hypothesis: true location shift is less than 0