##Reprocessing steps of small RNA-seq from Akbari et al., 2013
small RNA-seq data reported in Akbari et al., 2013 were downloaded from ENA, accession: PRJNA612346
Filtered small RNA sequences to reverse map to Akbari small RNA-seq, “all_unique_filt_sRNAs_rn.fa”, were generated in the smallRNA_abundance_scatterplots_to_upload script and are available on my Github
Additionally, see CLIPflexR documentation for:
unzip
FASTX’s fastq_quality_filter s FASTX’s fastx_clipper
bowtie2_index
revmap_count
bamtobed
# Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/Akbari_check_esiRNAs"
# pathtogunzip <- dir(Dir, pattern= "*.fastq.gz", full.names = TRUE)
# gunzip <- function(fileToGunZip,gunzip="gunzip",
# stderr=paste0(getwd(),"gunzip_stderr"),
# stdout=paste0(getwd(),"gunzip_stdout")){
# cmd <- gunzip
# if(!file.exists(fileToGunZip))stop("File does not exist")
#
# fileWithoutExtension <- tools::file_path_sans_ext(fileToGunZip)
# if(file.exists(fileToGunZip) & !file.exists(fileWithoutExtension)){
# args <- c(fileWithoutExtension,"-k")
# print(cmd)
# print(args)
#
# system2(cmd,
# args,
# stdout=stdout,
# stderr=stderr
# )
# }
# return(fileWithoutExtension)
# }
#
#
# for (file in pathtogunzip) {
# gunzip(file)
# }
#
#
# pathtoQC <- "/rugpfs/fs0/rice_lab/scratch/krozen/Akbari_check_esiRNAs"
# toqf <- dir(pathtoQC , pattern= "*.fastq$", full.names = TRUE)
#
# fastq_quality_filter <- function(fileTofqf,fqf="fastq_quality_filter",qEncoding=33,
# minimumQuality=20,
# minimumPercentOfRead=80,
# stderr=paste0(getwd(),"fastq_quality_filter_stderr"),
# stdout=paste0(getwd(),"fastq_quality_filter_stdout")){
#
#
# cmd <- fqf
#
# if(!file.exists(fileTofqf))stop("File does not exist")
#
# file_fqf <- file.path(dirname(fileTofqf),paste0("QF_",basename(fileTofqf)))
# if(file.exists(fileTofqf) & !file.exists(file_fqf)){
#
#
# args <- c(
# paste0("-Q ",qEncoding),
# paste0("-q ",minimumQuality),
# paste0("-p ",minimumPercentOfRead),
# paste0("-i ",fileTofqf),
# paste0("-o ",file_fqf)
# )
# print(cmd)
# print(args)
#
# system2(cmd,
# args,
# stdout=stdout,
# stderr=stderr
# )
# }
# return(file_fqf)
# }
#
# #R command to quality filter:
# file_QF <- fastq_quality_filter(file,fqf = "/Users/kathryn/FastX/fastq_quality_filter")
# bplapply(toqf, fastq_quality_filter, fqf = "/rugpfs/fs0/home/krozen/bin/fastq_quality_filter")
#
#
# fastx_clipper <- function(fileTofqs,fqc="fastx_clipper",length=18,
# adaptor="CTGTAGGCACCATCAATC",
# stderr=file.path(getwd(),"clipper_stats_stderr"),
# stdout=file.path(getwd(),"clipper_stats_stdout")){
# cmd <- fqc
#
# if(!file.exists(fileTofqs))stop("File does not exist")
#
# file_fqs <- file.path(dirname(fileTofqs),gsub(".fastastq",".fa",basename(fileTofqs)))
# if(file.exists(fileTofqs) & !file.exists(file_fqs)){
#
# args <- c(
# paste0("-l ",length),
# paste0("-a ",adaptor),
# paste0("-o ",file_fqs),
# paste0("-i ",fileTofqs)
# )
# print(cmd)
# print(args)
#
# system2(cmd,
# args,
# stdout=stdout,
# stderr=stderr
# )
# }
# return(file_fqs)
# }
#
# #R command to clip linker:
# pathtoQC <- "/rugpfs/fs0/rice_lab/scratch/krozen/Akbari_check_esiRNAs"
# toclip <- dir(pathtoQC , pattern= "*fastastq$", full.names = TRUE)
#
# bplapply(toclip, fastx_clipper, fqc = "/rugpfs/fs0/home/krozen/bin/fastx_clipper")
#
# #make indices for mapping
# pathtoQC <- "/rugpfs/fs0/rice_lab/scratch/krozen/Akbari_check_esiRNAs"
# unmapped <- dir(pathtoQC,pattern="*.fa$",full.names = TRUE)
# bowtie_index <- function(genomeFasta,
# outFasta=gsub("\\.fa","",genomeFasta)
# ) {
# require(Rbowtie2)
# if(!dir.exists(outFasta)){
# bowtie2_build(references=genomeFasta,
# bt2Index=outFasta)
# }
#
# }
#
# bplapply(unmapped, bowtie_index)
#
#
# #reverse map to all filtered known
# bowtie_align <- function(fq,index,sam=gsub("\\.fq|\\.fastq|\\.rm|\\.fa",".sam",index)
# ) {
# require(Rbowtie2)
# if(!dir.exists(sam)){
# bowtie2(bt2Index = index,
# samOutput = sam,
# seq1 = fq,"--threads 4 -f -L 18 -k 1000000")
# Rsamtools::asBam(sam,gsub("\\.sam","",sam))
# }
#
# }
#
# pathtoQC <- "/rugpfs/fs0/rice_lab/scratch/krozen/Akbari_check_esiRNAs"
# ref <- dir(pathtoQC , pattern= "*fa$", full.names = TRUE)
# ref <- gsub(".fa", unmapped)
# for (i in unmapped) {
# bowtie_align("/rugpfs/fs0/rice_lab/scratch/krozen/Akbari_check_esiRNAs/all_unique_filt_sRNAs_rn.fa", i)}
#
# #convert to BED:
#
# bamtobed <- function(file,filtDup=FALSE){
# require(GenomicAlignments)
# require(rtracklayer)
# temp <- readGAlignments(file,
# param=ScanBamParam(what = "qname"))
# names(temp) <- mcols(temp)$qname
# temp <- granges(temp)
#
# if(filtDup) temp <- temp[!duplicated(temp),]
# export.bed(temp,
# con = gsub("\\.bam",".bed",file))
# }
#
# #R command for bamtobed:
# pathtoQC <- "/Volumes/MyPassport/Akbari_check_esiRNAs/uncollapsed"
# mapped <- dir(pathtoQC , pattern= "bam$", full.names = TRUE)
#
# bplapply(mapped, bamtobed)
##Load all beds of reverse mapped smallRNA seq reads to filtered small RNAs and count
Dir <- "/Volumes/MyPassport/Akbari_check_esiRNAs/uncollapsed"
beds <- dir(Dir, pattern="*.bed$",full.names = TRUE)
esi <- lapply(beds, read.delim, header = FALSE, sep = "")
col.names <- gsub("/Volumes/MyPassport/Akbari_check_esiRNAs/uncollapsed/all_unique_filt_sRNAs_rn_QF_", "", beds)
col.names <- gsub("_clip.bed", "", col.names)
names(esi) <- col.names
esi_count <- lapply(esi, function(x) x %>% group_by(V4) %>% summarize(count=n()))
esi_count <- Reduce(function(x, y) merge(x, y, by = "V4", all = TRUE), esi_count)
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result
## Warning in merge.data.frame(x, y, by = "V4", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result
esi_count[is.na(esi_count)] <- 0
col.names <- c("smallRNA", col.names)
esi_count <- setNames(esi_count, col.names)
#calculate percentages or rpm for each small RNA
esi_count_norm_log <- esi_count %>% mutate_at(vars(-smallRNA), funs(((.+1)/sum(.))*1E6))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.))
##
## # After:
## list(name = ~ f(.))
## This warning is displayed once per session.
esi_count_norm_log$smallRNA <- gsub("Ago1", "AGO1", esi_count_norm_log$smallRNA)
esi_count_norm_log$smallRNA <- gsub("Ago2", "AGO2", esi_count_norm_log$smallRNA)
#rename
#available in my github; all filtered small RNAs, add family name so can collapse by family
getmatseq <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Mol_Cell_resub/Revision_files/Supplementary_Tables/Table_S4.txt", header =TRUE, sep = "\t")
esi_count_norm_log <- merge(esi_count_norm_log, getmatseq[,c("smallRNA", "aae_smallRNA_family")], by = "smallRNA")
#keep only Ago2 family names
t <- str_split(esi_count_norm_log$aae_smallRNA_family, "/")
b <- vector("list", length(t))
for(i in 1:length(t)){
b[[i]] <- grep("AGO2|aae", t[[i]], value = TRUE)
}
b <- lapply(b, function(x) if(identical(x, character(0))) NA_character_ else x)
esi_count_norm_log$Ago2nm <- unlist(lapply(b,function(x) paste0(x)))
#rename columns from run IDs to samples
#SraRun table is in my github and at: https://www.ncbi.nlm.nih.gov/Traces/study/?
#project accession: PRJNA612346
meta <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Akbari_check_esiRNAs/SraRunTable.txt", header = TRUE, sep = ",")
meta <- meta[, c("Sample.Name", "Run")]
search_for_these <- as.character(meta$Run)
replace_with_these <- as.character(meta$Sample.Name)
topAgo2log <- esi_count_norm_log
#topAgo2log <- setNames(topAgo2log, gsub("QF_", "", colnames(topAgo2log)))
found <- match(colnames(topAgo2log), search_for_these, nomatch = 0)
colnames(topAgo2log)[colnames(topAgo2log) %in% search_for_these] <- replace_with_these[found]
#write.table(topAgo2log, "/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/small_RNA_seq_counts_7D.txt", col.names = T, row.names = F, quote = F, sep = "\t")
#Beds are too large for upload to Github, but are available upon request An input table for graphing is available on my Github to load for use in the subsequent chunk
##Classify by small RNA type in ovary (blood-fed and nonblood-fed), embryos, and carcasses (all female) and graph expression in rpm for Figure 7D
#classify by type using fam column
topAgo2log <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/small_RNA_seq_counts_7D.txt", header = T, sep = "\t" )
topAgo2log$type <- ifelse(grepl("AGO2", topAgo2log$Ago2nm), paste0("top_novel"), NA)
topAgo2log$type <- ifelse(grepl("novel", topAgo2log$Ago2nm) & !grepl("top_novel", topAgo2log$type), paste0("novel"), topAgo2log$type)
topAgo2log$type <- ifelse(is.na(topAgo2log$type), paste0("known"), topAgo2log$type)
#keep only female tissues and make long form for graphing/dplyr calculations
tograb <- grep("low|high|Female", colnames(topAgo2log), value = TRUE)
topAgo2log <- melt(data = topAgo2log, id.vars = c("Ago2nm", "smallRNA", "type"), measure.vars = tograb)
#classify tissue
topAgo2log$tissue <- ifelse(grepl("Ovary", topAgo2log$variable), paste0("Ovary"), NA)
topAgo2log$tissue <- ifelse(grepl("Female_Carcass", topAgo2log$variable), paste0("Female_Carcass"), topAgo2log$tissue )
topAgo2log$tissue <- ifelse(grepl("Embryo", topAgo2log$variable), paste0("Embryo"), topAgo2log$tissue )
#classify sample groups because have two different size selections of the same library
topAgo2log$sample_group <- gsub(".low|.high", "", topAgo2log$variable)
topAgo2log$sample_group <- substring(topAgo2log$sample_group, 7)
#sum all small RNA expression values by family
topAgo2gg <- topAgo2log %>% group_by(.dots=c("variable", "Ago2nm")) %>% summarise(fam_ab = sum(value), tissue = dplyr::first(tissue), sample_group = dplyr::first(sample_group), type = dplyr::first(type))
## Warning: Factor `Ago2nm` contains implicit NA, consider using
## `forcats::fct_explicit_na`
#take average between high and low size selection per same sample
topAgo2gg <- topAgo2gg %>% group_by(.dots=c("sample_group", "Ago2nm")) %>% summarise(mean_ab = mean(fam_ab), tissue = dplyr::first(tissue), type = dplyr::first(type))
## Warning: Factor `Ago2nm` contains implicit NA, consider using
## `forcats::fct_explicit_na`
#select tissues of interest
topAgo2ggraph <- topAgo2gg[grepl(".72hr_PBM|NBF|.2.4hr", topAgo2gg$sample_group),]
#reorder factor levels so graph will display properly
topAgo2ggraph$sample_group <- factor(topAgo2ggraph$sample_group , levels=c(".72hr_PBM_Female_Carcass", ".NBF.Ovary", ".72hr_PBM.Ovary", ".2.4hr_Embryo"))
ggplot(topAgo2ggraph, aes(x=type, y= log2(mean_ab), fill=sample_group)) + geom_boxplot(outlier.shape = NA, width = 0.5, notch = TRUE) + scale_fill_manual(values=c("#CCE70B", "plum", "lightslateblue", "mediumaquamarine")) + theme_bw() + ylim(0, 18)
## Warning: Removed 48 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
wilcox.test(subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".2.4hr_Embryo" & topAgo2gg$type=="novel" ) , subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM_Female_Carcass" & topAgo2gg$type=="novel" ),
alternative = c("t")) #p-value < 2.2e-16; is the embryo less than the carcass
##
## Wilcoxon rank sum test with continuity correction
##
## data: subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".2.4hr_Embryo" & and subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM_Female_Carcass" & topAgo2gg$type == "novel") and topAgo2gg$type == "novel")
## W = 9675, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".NBF.Ovary" & topAgo2gg$type=="novel" ) , subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM_Female_Carcass" & topAgo2gg$type=="novel" ),
alternative = c("t")) #p-value < 2.2e-16; is the embryo less than the carcass
##
## Wilcoxon rank sum test with continuity correction
##
## data: subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".NBF.Ovary" & and subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM_Female_Carcass" & topAgo2gg$type == "novel") and topAgo2gg$type == "novel")
## W = 9998, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM.Ovary" & topAgo2gg$type=="novel" ) , subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM_Female_Carcass" & topAgo2gg$type=="novel" ),
alternative = c("t")) #p-value < 2.2e-16; is the embryo less than the carcass
##
## Wilcoxon rank sum test with continuity correction
##
## data: subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM.Ovary" & and subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM_Female_Carcass" & topAgo2gg$type == "novel") and topAgo2gg$type == "novel")
## W = 9697, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".2.4hr_Embryo" & topAgo2gg$type=="top_novel" ) , subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM_Female_Carcass" & topAgo2gg$type=="top_novel" ),
alternative = c("t")) #p-value = 1.358e-05; is the embryo less than the carcass
##
## Wilcoxon rank sum test
##
## data: subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".2.4hr_Embryo" & and subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM_Female_Carcass" & topAgo2gg$type == "top_novel") and topAgo2gg$type == "top_novel")
## W = 417, p-value = 1.358e-05
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".NBF.Ovary" & topAgo2gg$type=="top_novel" ) , subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM_Female_Carcass" & topAgo2gg$type=="top_novel" ),
alternative = c("t")) #p-value = 5.681e-05; is the embryo less than the carcass
## Warning in wilcox.test.default(subset(topAgo2gg$mean_ab,
## topAgo2gg$sample_group == : cannot compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".NBF.Ovary" & and subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM_Female_Carcass" & topAgo2gg$type == "top_novel") and topAgo2gg$type == "top_novel")
## W = 414, p-value = 5.681e-05
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM.Ovary" & topAgo2gg$type=="top_novel" ) , subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM_Female_Carcass" & topAgo2gg$type=="top_novel" ),
alternative = c("t")) #p-value = 0.004298; is the embryo less than the carcass
## Warning in wilcox.test.default(subset(topAgo2gg$mean_ab,
## topAgo2gg$sample_group == : cannot compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM.Ovary" & and subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM_Female_Carcass" & topAgo2gg$type == "top_novel") and topAgo2gg$type == "top_novel")
## W = 364, p-value = 0.004298
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".2.4hr_Embryo" & topAgo2gg$type=="known" ) , subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM_Female_Carcass" & topAgo2gg$type=="known" ),
alternative = c("t")) #p-value = 0.1295; is the embryo less than the carcass
##
## Wilcoxon rank sum test with continuity correction
##
## data: subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".2.4hr_Embryo" & and subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM_Female_Carcass" & topAgo2gg$type == "known") and topAgo2gg$type == "known")
## W = 3439, p-value = 0.1295
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".NBF.Ovary" & topAgo2gg$type=="known" ) , subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM_Female_Carcass" & topAgo2gg$type=="known" ),
alternative = c("t")) # p-value = 0.0008516; is the embryo less than the carcass
##
## Wilcoxon rank sum test with continuity correction
##
## data: subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".NBF.Ovary" & and subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM_Female_Carcass" & topAgo2gg$type == "known") and topAgo2gg$type == "known")
## W = 2816, p-value = 0.0008516
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM.Ovary" & topAgo2gg$type=="known" ) , subset(topAgo2gg$mean_ab, topAgo2gg$sample_group==".72hr_PBM_Female_Carcass" & topAgo2gg$type=="known" ),
alternative = c("t")) #p-value = 2.722e-05; is the embryo less than the carcass
##
## Wilcoxon rank sum test with continuity correction
##
## data: subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM.Ovary" & and subset(topAgo2gg$mean_ab, topAgo2gg$sample_group == ".72hr_PBM_Female_Carcass" & topAgo2gg$type == "known") and topAgo2gg$type == "known")
## W = 2543, p-value = 2.722e-05
## alternative hypothesis: true location shift is not equal to 0