Ma et al., 2021; PRJNA610833
Miesen et al., 2016; PRJNA310830
Haac et all., 2015; PRJNA272825
NC_001564.2 Cell fusing agent virus, Galveston strain
Phasi Charoen-like virus genome, Aag2-Bristol strain, KU936057.1 L-segment, KU936056.1 M-segment, KU936055.1 S-segment
Culex Y virus genome, isolate P1-BS2010, JQ659254.1 A-segment, JQ659255.1 B-segment
Additionally, see CLIPflexR documentation for:
FASTX’s fastq_quality_filter
FASTX’s fastx_clipper
bowtie2_index
bowtie_align
bamtobed
# pathtoQC <- "/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2"
#
# toqf <- dir(pathtoQC , pattern= "*.fastq$", full.names = TRUE)
#
# fastq_quality_filter <- function(fileTofqf,fqf="fastq_quality_filter",qEncoding=33,
# minimumQuality=20,
# minimumPercentOfRead=80,
# stderr=paste0(getwd(),"fastq_quality_filter_stderr"),
# stdout=paste0(getwd(),"fastq_quality_filter_stdout")){
#
#
# cmd <- fqf
#
# if(!file.exists(fileTofqf))stop("File does not exist")
#
# file_fqf <- file.path(dirname(fileTofqf),paste0("QF_",basename(fileTofqf)))
# if(file.exists(fileTofqf) & !file.exists(file_fqf)){
#
#
# args <- c(
# paste0("-Q ",qEncoding),
# paste0("-q ",minimumQuality),
# paste0("-p ",minimumPercentOfRead),
# paste0("-i ",fileTofqf),
# paste0("-o ",file_fqf)
# )
# print(cmd)
# print(args)
#
# system2(cmd,
# args,
# stdout=stdout,
# stderr=stderr
# )
# }
# return(file_fqf)
# }
#
# #R command to quality filter:
# bplapply(toqf, fastq_quality_filter, fqf = "/rugpfs/fs0/home/krozen/bin/fastq_quality_filter")
#
#
# ##
# fastx_qtoa <- function(fileTofqa,fqc="fastq_to_fasta", qEncoding=33,
# stderr=file.path(getwd(),"convert_stderr"),
# stdout=file.path(getwd(),"convert_stdout")){
# cmd <- fqc
#
# if(!file.exists(fileTofqa))stop("File does not exist")
#
# file_fqa <- file.path(dirname(fileTofqa),gsub(".fastq",".fa",basename(fileTofqa)))
# if(file.exists(fileTofqa) & !file.exists(file_fqa)){
#
# args <- c(
# paste0("-Q ",qEncoding),
# paste0("-o ",file_fqa),
# paste0("-i ",fileTofqa)
# )
# print(cmd)
# print(args)
#
# system2(cmd,
# args,
# stdout=stdout,
# stderr=stderr
# )
# }
# return(file_fqa)
# }
#
# bplapply(toclip, fastx_qtoa, fqc = "/rugpfs/fs0/home/krozen/bin/fastq_to_fasta")
#
# #TruSeq 3' adaptor: TGGAATTCTCGGGTGCCAAGG, adelman, rij; Lau appears to be already clipped
# fastx_clipper <- function(fileTofqs,fqc="fastx_clipper",length=18,
# adaptor="TGGAATTCTCGGGTGCCAAGG",
# stderr=file.path(getwd(),"clipper_stats_stderr"),
# stdout=file.path(getwd(),"clipper_stats_stdout")){
# cmd <- fqc
#
# if(!file.exists(fileTofqs))stop("File does not exist")
#
# file_fqs <- file.path(dirname(fileTofqs),gsub(".fa","_clip.fa",basename(fileTofqs)))
# if(file.exists(fileTofqs) & !file.exists(file_fqs)){
#
# args <- c(
# paste0("-l ",length),
# paste0("-a ",adaptor),
# paste0("-o ",file_fqs),
# paste0("-i ",fileTofqs)
# )
# print(cmd)
# print(args)
#
# system2(cmd,
# args,
# stdout=stdout,
# stderr=stderr
# )
# }
# return(file_fqs)
# }
#
# #R command to clip linker:
# pathtoQC <- "/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2"
# toclip <- dir(pathtoQC , pattern= "QF*", full.names = TRUE)
# toclip <- grep("Lau", toclip, value = T, invert = T)
#
# bplapply(toclip, fastx_clipper, fqc = "/rugpfs/fs0/home/krozen/bin/fastx_clipper")
#
# fqfilles <- dir("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2", pattern= "*clip.fa", full.names = FALSE)
# fqfilles2 <- dir("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2", pattern= "QF_Lau*", full.names = FALSE)
# fqfilles2 <- grep(".fa$", fqfilles2, value = T)
# fqfilles <- c(fqfilles,fqfilles2)
#
# #map; indices were previously made for each virus reference fasta using CLIPflexR's bowtie2_index function
# fqfilles <- dir("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2", pattern= "*clip.fa", full.names = FALSE)
# index<- gsub(".fa","",list.files("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2/persistent_viruses",pattern=".fa$",full.names = TRUE))
#
# bplapply(index,function(x,fqfilles){
# library(Rbowtie2)
# fqFiles <- file.path("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2",fqfilles)
#
# for(l in 1:length(fqFiles)){
# fq <- fqFiles[l]
# sam=file.path("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2",
# paste0(gsub("\\.fq|\\.fastq|\\.rm|\\.fa","",basename(fq)), "_", gsub("\\.fq|\\.fastq|\\.rm|\\.fa",".sam",basename(x))))
# bowtie2(bt2Index = x,
# samOutput = sam,
# seq1 = fq,"--threads 4 -f -L 18 -N 1 ")
# Rsamtools::asBam(sam,gsub("\\.sam","",sam),overwrite = TRUE)
# }
# },fqfilles=fqfilles)
#uncollapsed, virus
bamFiles <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped",pattern="*._.*_.*\\.bam$",full.names=TRUE)
countByFlagStat <- function(bamFile){
system(paste0("samtools flagstat ",bamFile," > ",paste0(bamFile,"_flagStats.txt")))
}
#bplapply(bamFiles,countByFlagStat)
FilesToRead <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped",pattern="*._flagStats.txt$",full.names=TRUE)
#virus
repName <- gsub(".*clip_|.bam_flagStats.txt","",basename(FilesToRead))
repName <- gsub(".*rep1_", "", repName)
repName <- gsub(".*rep2_", "", repName)
#Lau, rij, etc.
sampleName <- gsub(".*QF_|_clip_.*","",basename(FilesToRead))
sampleName <- gsub("(rep[0-9]).*", "\\1", sampleName)
readVirusMatrix <- data.frame(FilesToRead,repName,sampleName)
parseFlagstat2 <- function(fileToR){
flagstats <- read.delim(fileToR,sep=" ",stringsAsFactors=FALSE,header=FALSE)
Total <- as.numeric(flagstats[1,1])
Mapped <- as.numeric(flagstats[5,1])
PercentMapped <- (Mapped/Total)*100
return(data.frame(PercentMapped=PercentMapped))
}
Samples <- unique(readVirusMatrix[,3])
myRes <- list()
for(i in 1:length(Samples)){
myRes[[i]] <- unlist(sapply(as.vector(readVirusMatrix[readVirusMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
}
#make dataframe with rownames as samples, colnames as repeat feature class,and entries are percentage of mapped reads
fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readVirusMatrix[,2])
rownames(fullRes) <- Samples
#write.table(as.data.frame(fullRes), "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/uncollapsed_percent_mapped_virus.txt", quote = F, row.names = T, sep = "\t")
#get mapped and total read numbers for AaegL5_mapped
bamFiles <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/AaegL5_mapped",pattern="*._.*_.*\\.bam$",full.names=TRUE)
#bplapply(bamFiles,countByFlagStat)
FilesToRead <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/AaegL5_mapped",pattern="*._flagStats.txt$",full.names=TRUE)
#AaegL5
repName <- rep("AaegL5_mapped", length(FilesToRead))
#Lau, rij, etc.
sampleName <- gsub(".*QF_|_clip_.*","",basename(FilesToRead))
sampleName <- gsub("(rep[0-9]).*", "\\1", sampleName)
readMosMatrix <- data.frame(FilesToRead,repName,sampleName)
Samples <- unique(readMosMatrix[,3])
myRes <- list()
for(i in 1:length(Samples)){
myRes[[i]] <- unlist(sapply(as.vector(readMosMatrix[readMosMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
}
fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readMosMatrix[,2])
rownames(fullRes) <- Samples
#write.table(as.data.frame(fullRes), "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/uncollapsed_percent_AaegL5.txt", quote = F, row.names = T, sep = "\t")
#get total numbers for bedgraphs:
parseRM <- function(fileToR){
flagstats <- read.delim(fileToR,sep=" ",stringsAsFactors=FALSE,header=FALSE)
Mapped <- as.numeric(flagstats[5,1])
return(data.frame(TotalMapped=Mapped))
}
myRes <- list()
for(i in 1:length(Samples)){
myRes[[i]] <- unlist(sapply(as.vector(readMosMatrix[readMosMatrix[,3] %in% Samples[i],1]),parseRM))
}
fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readMosMatrix[,2])
rownames(fullRes) <- Samples
fullRes <- as.data.frame(fullRes)
fullRes$sample <- row.names(fullRes)
fullRes <- fullRes %>% mutate(norm_factor = 1E6/AaegL5_mapped)
#write.table(as.data.frame(fullRes), "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/uncollapsed_total_AaegL5.txt", quote = F, row.names = T, sep = "\t")
bedFiles <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped",pattern="*.bed$",full.names=TRUE)
CFAVbedFiles <- grep("CFAV.bed", bedFiles, value = T)
CFAVbedFiles <- grep("Lau_Aag2_rep2", CFAVbedFiles, value = T, invert = T)
CFAVbed <- lapply(CFAVbedFiles, read.delim, header = FALSE, sep = "\t")
CFAVbed <- do.call("rbind", CFAVbed)
#write.table(CFAVbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_CFAV.bed", quote = F, col.names = F, sep = "\t", row.names = F)
PCLVLbedFiles <- grep("PCLV_L.bed", bedFiles, value = T)
PCLVLbedFiles <- grep("Lau_Aag2_rep2", PCLVLbedFiles, value = T, invert = T)
PCLVLbed <- lapply(PCLVLbedFiles, read.delim, header = FALSE, sep = "\t")
PCLVLbed <- do.call("rbind", PCLVLbed)
#write.table(PCLVLbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_PCLV_L.bed", quote = F, col.names = F, sep = "\t", row.names = F)
PCLVMbedFiles <- grep("PCLV_M.bed", bedFiles, value = T)
PCLVMbedFiles <- grep("Lau_Aag2_rep2", PCLVMbedFiles, value = T, invert = T)
PCLVMbed <- lapply(PCLVMbedFiles, read.delim, header = FALSE, sep = "\t")
PCLVMbed <- do.call("rbind", PCLVMbed)
#write.table(PCLVMbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_PCLV_M.bed", quote = F, col.names = F, sep = "\t", row.names = F)
PCLVSbedFiles <- grep("PCLV_S.bed", bedFiles, value = T)
PCLVSbedFiles <- grep("Lau_Aag2_rep2", PCLVSbedFiles, value = T, invert = T)
PCLVSbed <- lapply(PCLVSbedFiles, read.delim, header = FALSE, sep = "\t")
PCLVSbed <- do.call("rbind", PCLVSbed)
#write.table(PCLVSbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_PCLV_S.bed", quote = F, col.names = F, sep = "\t", row.names = F)
CLYAbedFiles <- grep("CLY_A.bed", bedFiles, value = T)
CLYAbedFiles <- grep("Lau_Aag2_rep2", CLYAbedFiles, value = T, invert = T)
CLYAbedFiles <- grep("Adelman", CLYAbedFiles, value = T, invert = T)
CLYAbed <- lapply(CLYAbedFiles, read.delim, header = FALSE, sep = "\t")
CLYAbed <- do.call("rbind", CLYAbed)
#write.table(CLYAbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_CLY_a.bed", quote = F, col.names = F, sep = "\t", row.names = F)
CLYBbedFiles <- grep("CLY_B.bed", bedFiles, value = T)
CLYBbedFiles <- grep("Lau_Aag2_rep2", CLYBbedFiles, value = T, invert = T)
CLYBbedFiles <- grep("Adelman", CLYBbedFiles, value = T, invert = T)
CLYBbed <- lapply(CLYBbedFiles, read.delim, header = FALSE, sep = "\t")
CLYBbed <- do.call("rbind", CLYBbed)
#write.table(CLYBbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_CLY_B.bed", quote = F, col.names = F, sep = "\t", row.names = F)
stat <- read.delim("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/uncollapsed_total_AaegL5.txt", header = T, sep = "\t")
stat <- stat[-5,]
sum(stat$AaegL5_mapped)
## [1] 100472450
1000000/sum(stat$AaegL5_mapped)
## [1] 0.009952977
#0.009952977
Dir <- "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped"
mapped <- dir(Dir, pattern = ".bam$", full.names = T)
#bplapply(mapped, bamtobed)
see mirdeep2_processing_filtering_to_upload script in my Github for CLIP processing details; short uncollapsed CLIP reads with linker artifacts removed were mapped to viruses as above, for small RNA-seq
bamFiles <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*\\.bam$",full.names=TRUE)
#bplapply(bamFiles,countByFlagStat)
FilesToRead <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._flagStats.txt$",full.names=TRUE)
Aaegl5_files <- grep("AaegL5", FilesToRead, value = T)
virus_files <- grep("AaegL5", FilesToRead, value = T, invert = T)
bamvirus <- grep("AaegL5", bamFiles, value = T, invert = T)
#bplapply(bamvirus, bamtobed)
#virus
repName <- gsub(".*_short_rmlinker_|.bam_flagStats.txt", "", virus_files)
#mysamples
sampleName <-gsub(".*viruses_uncoll/|_short.*", "", virus_files)
readVirusMatrix <- data.frame(virus_files,repName,sampleName)
Samples <- unique(readVirusMatrix[,3])
myRes <- list()
for(i in 1:length(Samples)){
myRes[[i]] <- unlist(sapply(as.vector(readVirusMatrix[readVirusMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
}
#make dataframe with rownames as samples, colnames as repeat feature class,and entries are percentage of mapped reads
fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readVirusMatrix[,2])
rownames(fullRes) <- Samples
resdf <- as.data.frame(t(fullRes))
aegyptiAgo1 <- grep("aegypti_Ago1", Samples, value = TRUE)
aegyptiAgo2 <- grep("aegypti_Ago2", Samples, value = TRUE)
aegyptirIgG <- grep("aegypti_rIgG", Samples, value = TRUE)
aegyptimIgG <- grep("aegypti_mIgG", Samples, value = TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", Samples, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", Samples, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", Samples, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", Samples, value = TRUE)
resdf$Aag2_Ago1 <- rowMeans(resdf[Aag2Ago1])
resdf$Aag2_rIgG <- rowMeans(resdf[Aag2rIgG])
resdf$Aag2_Ago2 <- rowMeans(resdf[Aag2Ago2])
resdf$Aag2_mIgG <- rowMeans(resdf[Aag2mIgG])
resdf$aegypti_Ago1 <- rowMeans(resdf[aegyptiAgo1])
resdf$aegypti_rIgG <- rowMeans(resdf[aegyptirIgG])
resdf$aegypti_Ago2 <- rowMeans(resdf[aegyptiAgo2])
resdf$aegypti_mIgG <- rowMeans(resdf[aegyptimIgG])
# write.table(resdf, "/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/uncollapsed_percent_mapped_virus.txt", quote = F, row.names = T, sep = "\t")
CLIPreads <- read.delim("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/uncollapsed_percent_mapped_virus.txt", header = T, sep = "\t")
CLIPreads <- t(CLIPreads)
#AaegL5
repName <- rep("AaegL5_mapped", length(Aaegl5_files))
sampleName <-gsub(".*viruses_uncoll/|_short.*", "", Aaegl5_files)
readMosMatrix <- data.frame(Aaegl5_files,repName,sampleName)
Samples <- unique(readMosMatrix[,3])
myRes <- list()
for(i in 1:length(Samples)){
myRes[[i]] <- unlist(sapply(as.vector(readMosMatrix[readMosMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
}
fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readMosMatrix[,2])
rownames(fullRes) <- Samples
# write.table(as.data.frame(fullRes), "/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/uncollapsed_percent_AaegL5.txt", quote = F, row.names = T, sep = "\t")
#total numbers
myRes <- list()
for(i in 1:length(Samples)){
myRes[[i]] <- unlist(sapply(as.vector(readMosMatrix[readMosMatrix[,3] %in% Samples[i],1]),parseRM))
}
fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readMosMatrix[,2])
rownames(fullRes) <- Samples
fullRes <- as.data.frame(fullRes)
fullRes$group <- ifelse(row.names(fullRes) %in% Aag2Ago1, paste0("Aag2_Ago1"), NA )
fullRes$group <- ifelse(row.names(fullRes) %in% Aag2rIgG, paste0("Aag2_rIgG"), paste0(fullRes$group))
fullRes$group <- ifelse(row.names(fullRes) %in% Aag2Ago2, paste0("Aag2_Ago2"), paste0(fullRes$group))
fullRes$group <- ifelse(row.names(fullRes) %in% Aag2mIgG, paste0("Aag2_mIgG"), paste0(fullRes$group))
fullRes$group <- ifelse(row.names(fullRes) %in% aegyptiAgo1, paste0("aegypti_Ago1"), paste0(fullRes$group))
fullRes$group <- ifelse(row.names(fullRes) %in% aegyptirIgG, paste0("aegypti_rIgG"), paste0(fullRes$group))
fullRes$group <- ifelse(row.names(fullRes) %in% aegyptiAgo2, paste0("aegypti_Ago2"), paste0(fullRes$group))
fullRes$group <- ifelse(row.names(fullRes) %in% aegyptimIgG, paste0("aegypti_mIgG"), paste0(fullRes$group))
summ <- fullRes %>% dplyr::group_by(group) %>% summarise(AaegL5_total = sum(AaegL5_mapped))
summ <- summ %>% mutate(norm_factor = 1E6/AaegL5_total)
#write.table(summ, "/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/uncollapsed_total_AaegL5.txt", quote = F, row.names = T, sep = "\t")
CFAV_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*CFAV.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", CFAV_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", CFAV_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", CFAV_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", CFAV_beds, value = TRUE)
Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")
cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)
# h <-grep("cat",names(.GlobalEnv),value=TRUE)
# l <- do.call("list",mget(h))
#
# for(i in seq_along(l)) {
# write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_CFAV.bed"),
# col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }
PCLVL_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*PCLV_L.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", PCLVL_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", PCLVL_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", PCLVL_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", PCLVL_beds, value = TRUE)
Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")
cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)
h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))
# for(i in seq_along(l)) {
# write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_PCLV_L.bed"),
# col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }
PCLVS_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*PCLV_S.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", PCLVS_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", PCLVS_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", PCLVS_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", PCLVS_beds, value = TRUE)
Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")
cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)
h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))
# for(i in seq_along(l)) {
# write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_PCLV_S.bed"),
# col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }
PCLVM_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*PCLV_M.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", PCLVM_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", PCLVM_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", PCLVM_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", PCLVM_beds, value = TRUE)
Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")
cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)
h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))
# for(i in seq_along(l)) {
# write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_PCLVM.bed"),
# col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }
CLYA_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*CLY_A.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", CLYA_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", CLYA_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", CLYA_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", CLYA_beds, value = TRUE)
Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")
cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)
h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))
# for(i in seq_along(l)) {
# write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_CLYA.bed"),
# col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }
CLYB_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*CLY_B.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", CLYB_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", CLYB_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", CLYB_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", CLYB_beds, value = TRUE)
Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")
cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)
h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))
# for(i in seq_along(l)) {
# write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_CLYB.bed"),
# col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }
cat_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="cat*",full.names=TRUE)
cat_beds <- grep("*.bed$", cat_beds, value = T)
cat_count <- lapply(cat_beds, import, format = "BED")
names(cat_count) <- gsub("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/cat_", "", cat_beds)
cat_width <-lapply(cat_count, function(x) data.frame(width = width(x), strand = x@strand))
cat_width <- rbindlist(cat_width, idcol = TRUE)
cat_width <- setNames(cat_width, c("sample", "width", "strand"))
sum_df <- cat_width %>% group_by(.dots=c("sample","width", "strand")) %>% mutate(count = n()) %>% summarise_all(dplyr::first)
sum_df_short <- sum_df %>% filter(width < 25)
perc_df_short<- sum_df_short %>% group_by(sample) %>% mutate(nrow = sum(count), percent=count/nrow)
perc_df_short$percent <- ifelse(perc_df_short$strand=="-", perc_df_short$percent* -1, perc_df_short$percent*1)
cat_beds <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped",pattern="cat*",full.names=TRUE)
cat_beds <- grep("*.bed$", cat_beds, value = T)
cat_count <- lapply(cat_beds, import, format = "BED")
names(cat_count) <- gsub("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_", "", cat_beds)
cat_width <-lapply(cat_count, function(x) data.frame(width = width(x), strand = x@strand))
cat_width <- rbindlist(cat_width, idcol = TRUE)
cat_width <- setNames(cat_width, c("sample", "width", "strand"))
sum_df <- cat_width %>% group_by(.dots=c("sample","width", "strand")) %>% mutate(count = n()) %>% summarise_all(dplyr::first)
sum_df_short <- sum_df %>% filter(width < 25)
perc_df_short_sRNA <- sum_df_short %>% group_by(sample) %>% mutate(nrow = sum(count), percent=count/nrow)
perc_df_short_sRNA$percent <- ifelse(perc_df_short_sRNA$strand=="-", perc_df_short_sRNA$percent* -1, perc_df_short_sRNA$percent*1)
Input file “persistent_viruses_CLIP_smallRNA_seq.txt” is available in my Github
perc_df_short <- perc_df_short[!grepl("IgG", perc_df_short$sample),]
all_uncoll_short <- rbind(perc_df_short_sRNA, perc_df_short)
#write.table(all_uncoll_short, "/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/persistent_viruses_CLIP_smallRNA_seq.txt", col.names = T, row.names = F, quote = F, sep = "\t")
all_uncoll_short <- read.table("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/persistent_viruses_CLIP_smallRNA_seq.txt", header = T, sep = "\t")
ggplot(all_uncoll_short[grepl("CFAV", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) + scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black", "black")) + theme_bw() + xlim(17,25)
## Warning: Removed 4 rows containing missing values (geom_bar).
ggplot(all_uncoll_short[grepl("PCLV_L", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) + scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black", "black")) + theme_bw() + xlim(17,25)
## Warning: Removed 4 rows containing missing values (geom_bar).
ggplot(all_uncoll_short[grepl("PCLVM|PCLV_M", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) + scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black", "black")) + theme_bw() + xlim(17,25)
## Warning: Removed 4 rows containing missing values (geom_bar).
ggplot(all_uncoll_short[grepl("PCLV_S", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) + scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black", "black")) + theme_bw() + xlim(17,25)
## Warning: Removed 4 rows containing missing values (geom_bar).
ggplot(all_uncoll_short[grepl("CLY_a|CLYA", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) + scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black", "black")) + theme_bw() + xlim(17,25)
## Warning: Removed 4 rows containing missing values (geom_bar).
ggplot(all_uncoll_short[grepl("CLYB|CLY_B", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) + scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black", "black")) + theme_bw() + xlim(17,25)
## Warning: Removed 4 rows containing missing values (geom_bar).