reprocess_Aag2_small_RNA

Reprocess small RNA-seq from:

Ma et al., 2021; PRJNA610833
Miesen et al., 2016; PRJNA310830
Haac et all., 2015; PRJNA272825

Processed reads were mapped to:

NC_001564.2 Cell fusing agent virus, Galveston strain
Phasi Charoen-like virus genome, Aag2-Bristol strain, KU936057.1 L-segment, KU936056.1 M-segment, KU936055.1 S-segment
Culex Y virus genome, isolate P1-BS2010, JQ659254.1 A-segment, JQ659255.1 B-segment

Additionally, see CLIPflexR documentation for:
FASTX’s fastq_quality_filter
FASTX’s fastx_clipper
bowtie2_index
bowtie_align
bamtobed

# pathtoQC <- "/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2" 
# 
#  toqf <- dir(pathtoQC , pattern= "*.fastq$", full.names = TRUE) 
# 
#  fastq_quality_filter <- function(fileTofqf,fqf="fastq_quality_filter",qEncoding=33, 
#                                   minimumQuality=20, 
#                                   minimumPercentOfRead=80,                                  
#                                   stderr=paste0(getwd(),"fastq_quality_filter_stderr"), 
#                                   stdout=paste0(getwd(),"fastq_quality_filter_stdout")){ 
# 
# 
#    cmd <- fqf 
# 
#    if(!file.exists(fileTofqf))stop("File does not exist") 
# 
#    file_fqf <- file.path(dirname(fileTofqf),paste0("QF_",basename(fileTofqf))) 
#    if(file.exists(fileTofqf) & !file.exists(file_fqf)){ 
# 
# 
#      args <- c( 
#        paste0("-Q ",qEncoding), 
#        paste0("-q ",minimumQuality), 
#        paste0("-p ",minimumPercentOfRead), 
#        paste0("-i  ",fileTofqf), 
#        paste0("-o ",file_fqf)       
#      ) 
#      print(cmd) 
#      print(args) 
# 
#      system2(cmd, 
#              args, 
#              stdout=stdout, 
#              stderr=stderr 
#      ) 
#    } 
#    return(file_fqf) 
#  } 
# 
#  #R command to quality filter: 
#  bplapply(toqf,  fastq_quality_filter, fqf = "/rugpfs/fs0/home/krozen/bin/fastq_quality_filter")
#  
#  
#  ##
#  fastx_qtoa <- function(fileTofqa,fqc="fastq_to_fasta", qEncoding=33,
#                           stderr=file.path(getwd(),"convert_stderr"), 
#                           stdout=file.path(getwd(),"convert_stdout")){ 
#   cmd <- fqc 
#   
#   if(!file.exists(fileTofqa))stop("File does not exist") 
#   
#   file_fqa <- file.path(dirname(fileTofqa),gsub(".fastq",".fa",basename(fileTofqa))) 
#   if(file.exists(fileTofqa) & !file.exists(file_fqa)){ 
#     
#     args <- c(
#       paste0("-Q ",qEncoding), 
#       paste0("-o ",file_fqa), 
#       paste0("-i ",fileTofqa) 
#     ) 
#     print(cmd) 
#     print(args) 
#     
#     system2(cmd, 
#             args, 
#             stdout=stdout, 
#             stderr=stderr 
#     ) 
#   } 
#   return(file_fqa) 
#  } 
#  
# bplapply(toclip, fastx_qtoa, fqc = "/rugpfs/fs0/home/krozen/bin/fastq_to_fasta") 
# 
# #TruSeq 3' adaptor: TGGAATTCTCGGGTGCCAAGG, adelman, rij; Lau appears to be already clipped
# fastx_clipper <- function(fileTofqs,fqc="fastx_clipper",length=18, 
#                            adaptor="TGGAATTCTCGGGTGCCAAGG", 
#                            stderr=file.path(getwd(),"clipper_stats_stderr"), 
#                            stdout=file.path(getwd(),"clipper_stats_stdout")){ 
#    cmd <- fqc 
# 
#    if(!file.exists(fileTofqs))stop("File does not exist") 
# 
#    file_fqs <- file.path(dirname(fileTofqs),gsub(".fa","_clip.fa",basename(fileTofqs))) 
#    if(file.exists(fileTofqs) & !file.exists(file_fqs)){ 
# 
#      args <- c( 
#        paste0("-l ",length), 
#        paste0("-a  ",adaptor), 
#        paste0("-o ",file_fqs), 
#        paste0("-i ",fileTofqs) 
#      ) 
#      print(cmd) 
#      print(args) 
# 
#      system2(cmd, 
#              args, 
#              stdout=stdout, 
#              stderr=stderr 
#      ) 
#    } 
#    return(file_fqs) 
#  } 
# 
#  #R command to clip linker: 
#  pathtoQC <- "/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2" 
#  toclip <- dir(pathtoQC , pattern= "QF*", full.names = TRUE) 
#  toclip <- grep("Lau", toclip, value = T, invert = T)
# 
#  bplapply(toclip, fastx_clipper, fqc = "/rugpfs/fs0/home/krozen/bin/fastx_clipper") 
#  
#  fqfilles <- dir("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2", pattern= "*clip.fa", full.names = FALSE)
#  fqfilles2 <- dir("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2", pattern= "QF_Lau*", full.names = FALSE)
#  fqfilles2 <- grep(".fa$", fqfilles2, value = T)
#  fqfilles <- c(fqfilles,fqfilles2) 
#  
# #map; indices were previously made for each virus reference fasta using CLIPflexR's bowtie2_index function
# fqfilles <- dir("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2", pattern= "*clip.fa", full.names = FALSE)
# index<- gsub(".fa","",list.files("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2/persistent_viruses",pattern=".fa$",full.names = TRUE))
# 
# bplapply(index,function(x,fqfilles){
#   library(Rbowtie2)
#   fqFiles <- file.path("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2",fqfilles)
# 
#   for(l in 1:length(fqFiles)){
#     fq <- fqFiles[l]
#     sam=file.path("/rugpfs/fs0/rice_agoclip/scratch/krozen/sRNA_seq_Aag2",
#                   paste0(gsub("\\.fq|\\.fastq|\\.rm|\\.fa","",basename(fq)), "_", gsub("\\.fq|\\.fastq|\\.rm|\\.fa",".sam",basename(x))))
#     bowtie2(bt2Index = x,
#             samOutput = sam,
#             seq1 = fq,"--threads 4 -f -L 18 -N 1 ")
#     Rsamtools::asBam(sam,gsub("\\.sam","",sam),overwrite = TRUE)
#   }
# },fqfilles=fqfilles)

Get read stats for virus- and AaegL5-mapped from small RNA-seq

#uncollapsed, virus 
bamFiles <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped",pattern="*._.*_.*\\.bam$",full.names=TRUE)

countByFlagStat <- function(bamFile){
  system(paste0("samtools  flagstat ",bamFile," > ",paste0(bamFile,"_flagStats.txt")))
}
#bplapply(bamFiles,countByFlagStat)

FilesToRead <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped",pattern="*._flagStats.txt$",full.names=TRUE)

#virus
repName <- gsub(".*clip_|.bam_flagStats.txt","",basename(FilesToRead))
repName <- gsub(".*rep1_", "", repName)
repName <- gsub(".*rep2_", "", repName)
#Lau, rij, etc.
sampleName <- gsub(".*QF_|_clip_.*","",basename(FilesToRead))
sampleName <- gsub("(rep[0-9]).*", "\\1", sampleName)

readVirusMatrix <- data.frame(FilesToRead,repName,sampleName)

parseFlagstat2 <- function(fileToR){
  flagstats <- read.delim(fileToR,sep=" ",stringsAsFactors=FALSE,header=FALSE)
  Total <- as.numeric(flagstats[1,1])
  Mapped <- as.numeric(flagstats[5,1])
  PercentMapped <- (Mapped/Total)*100
  return(data.frame(PercentMapped=PercentMapped))
}
Samples <- unique(readVirusMatrix[,3])
myRes <- list()
for(i in 1:length(Samples)){
  myRes[[i]] <- unlist(sapply(as.vector(readVirusMatrix[readVirusMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
}

#make dataframe with rownames as samples, colnames as repeat feature class,and entries are percentage of mapped reads

fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readVirusMatrix[,2])
rownames(fullRes) <- Samples

#write.table(as.data.frame(fullRes), "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/uncollapsed_percent_mapped_virus.txt", quote = F, row.names = T, sep = "\t")

#get mapped and total read numbers for AaegL5_mapped
bamFiles <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/AaegL5_mapped",pattern="*._.*_.*\\.bam$",full.names=TRUE)
#bplapply(bamFiles,countByFlagStat)

FilesToRead <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/AaegL5_mapped",pattern="*._flagStats.txt$",full.names=TRUE)

#AaegL5
repName <- rep("AaegL5_mapped", length(FilesToRead))

#Lau, rij, etc.
sampleName <- gsub(".*QF_|_clip_.*","",basename(FilesToRead))
sampleName <- gsub("(rep[0-9]).*", "\\1", sampleName)

readMosMatrix <- data.frame(FilesToRead,repName,sampleName)

Samples <- unique(readMosMatrix[,3])
myRes <- list()
for(i in 1:length(Samples)){
  myRes[[i]] <- unlist(sapply(as.vector(readMosMatrix[readMosMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
}

fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readMosMatrix[,2])
rownames(fullRes) <- Samples

#write.table(as.data.frame(fullRes), "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/uncollapsed_percent_AaegL5.txt", quote = F, row.names = T, sep = "\t")

#get total numbers for bedgraphs:
parseRM <- function(fileToR){
  flagstats <- read.delim(fileToR,sep=" ",stringsAsFactors=FALSE,header=FALSE)
  Mapped <- as.numeric(flagstats[5,1])
  return(data.frame(TotalMapped=Mapped))
}

myRes <- list()
for(i in 1:length(Samples)){
  myRes[[i]] <- unlist(sapply(as.vector(readMosMatrix[readMosMatrix[,3] %in% Samples[i],1]),parseRM))
}

fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readMosMatrix[,2])
rownames(fullRes) <- Samples
fullRes <- as.data.frame(fullRes)
fullRes$sample <- row.names(fullRes)
fullRes <- fullRes %>% mutate(norm_factor = 1E6/AaegL5_mapped)

#write.table(as.data.frame(fullRes), "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/uncollapsed_total_AaegL5.txt", quote = F, row.names = T, sep = "\t")


bedFiles <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped",pattern="*.bed$",full.names=TRUE)
CFAVbedFiles <- grep("CFAV.bed", bedFiles, value = T)
CFAVbedFiles <- grep("Lau_Aag2_rep2", CFAVbedFiles, value = T, invert = T)

CFAVbed <- lapply(CFAVbedFiles, read.delim, header = FALSE, sep = "\t")
CFAVbed  <- do.call("rbind", CFAVbed)
#write.table(CFAVbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_CFAV.bed", quote = F, col.names = F, sep = "\t", row.names = F)

PCLVLbedFiles <- grep("PCLV_L.bed", bedFiles, value = T)
PCLVLbedFiles <- grep("Lau_Aag2_rep2", PCLVLbedFiles, value = T, invert = T)

PCLVLbed <- lapply(PCLVLbedFiles, read.delim, header = FALSE, sep = "\t")
PCLVLbed  <- do.call("rbind", PCLVLbed)
#write.table(PCLVLbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_PCLV_L.bed", quote = F, col.names = F, sep = "\t", row.names = F)

PCLVMbedFiles <- grep("PCLV_M.bed", bedFiles, value = T)
PCLVMbedFiles <- grep("Lau_Aag2_rep2", PCLVMbedFiles, value = T, invert = T)

PCLVMbed <- lapply(PCLVMbedFiles, read.delim, header = FALSE, sep = "\t")
PCLVMbed  <- do.call("rbind", PCLVMbed)
#write.table(PCLVMbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_PCLV_M.bed", quote = F, col.names = F, sep = "\t", row.names = F)

PCLVSbedFiles <- grep("PCLV_S.bed", bedFiles, value = T)
PCLVSbedFiles <- grep("Lau_Aag2_rep2", PCLVSbedFiles, value = T, invert = T)

PCLVSbed <- lapply(PCLVSbedFiles, read.delim, header = FALSE, sep = "\t")
PCLVSbed  <- do.call("rbind", PCLVSbed)
#write.table(PCLVSbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_PCLV_S.bed", quote = F, col.names = F, sep = "\t", row.names = F)

CLYAbedFiles <- grep("CLY_A.bed", bedFiles, value = T)
CLYAbedFiles <- grep("Lau_Aag2_rep2", CLYAbedFiles, value = T, invert = T)
CLYAbedFiles <- grep("Adelman", CLYAbedFiles, value = T, invert = T)

CLYAbed <- lapply(CLYAbedFiles, read.delim, header = FALSE, sep = "\t")
CLYAbed  <- do.call("rbind", CLYAbed)
#write.table(CLYAbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_CLY_a.bed", quote = F, col.names = F, sep = "\t", row.names = F)

CLYBbedFiles <- grep("CLY_B.bed", bedFiles, value = T)
CLYBbedFiles <- grep("Lau_Aag2_rep2", CLYBbedFiles, value = T, invert = T)
CLYBbedFiles <- grep("Adelman", CLYBbedFiles, value = T, invert = T)

CLYBbed <- lapply(CLYBbedFiles, read.delim, header = FALSE, sep = "\t")
CLYBbed  <- do.call("rbind", CLYBbed)
#write.table(CLYBbed, "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_sRNAseq_CLY_B.bed", quote = F, col.names = F, sep = "\t", row.names = F)

stat <- read.delim("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/uncollapsed_total_AaegL5.txt", header = T, sep = "\t")
stat <- stat[-5,] 
sum(stat$AaegL5_mapped)

## [1] 100472450

1000000/sum(stat$AaegL5_mapped)

## [1] 0.009952977

 #0.009952977

Dir <- "/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped"
mapped <- dir(Dir, pattern = ".bam$", full.names = T)
#bplapply(mapped, bamtobed)

Take short uncollapsed CLIP virus-mapped reads to get stat

see mirdeep2_processing_filtering_to_upload script in my Github for CLIP processing details; short uncollapsed CLIP reads with linker artifacts removed were mapped to viruses as above, for small RNA-seq

bamFiles <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*\\.bam$",full.names=TRUE)
#bplapply(bamFiles,countByFlagStat)

FilesToRead <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._flagStats.txt$",full.names=TRUE)
Aaegl5_files <- grep("AaegL5",  FilesToRead, value = T)
virus_files <- grep("AaegL5",  FilesToRead, value = T, invert = T)

bamvirus <- grep("AaegL5",  bamFiles, value = T, invert = T)
#bplapply(bamvirus, bamtobed)

#virus
repName <- gsub(".*_short_rmlinker_|.bam_flagStats.txt", "", virus_files)

#mysamples
sampleName <-gsub(".*viruses_uncoll/|_short.*", "", virus_files)

readVirusMatrix <- data.frame(virus_files,repName,sampleName)

Samples <- unique(readVirusMatrix[,3])
myRes <- list()
for(i in 1:length(Samples)){
  myRes[[i]] <- unlist(sapply(as.vector(readVirusMatrix[readVirusMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
}

#make dataframe with rownames as samples, colnames as repeat feature class,and entries are percentage of mapped reads

fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readVirusMatrix[,2])
rownames(fullRes) <- Samples
resdf <- as.data.frame(t(fullRes))

aegyptiAgo1 <-  grep("aegypti_Ago1", Samples, value = TRUE)
aegyptiAgo2 <- grep("aegypti_Ago2", Samples, value = TRUE)
aegyptirIgG <- grep("aegypti_rIgG", Samples, value = TRUE)
aegyptimIgG <- grep("aegypti_mIgG", Samples, value = TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", Samples, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", Samples, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", Samples, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", Samples, value = TRUE)

resdf$Aag2_Ago1 <- rowMeans(resdf[Aag2Ago1])
resdf$Aag2_rIgG <- rowMeans(resdf[Aag2rIgG])
resdf$Aag2_Ago2 <- rowMeans(resdf[Aag2Ago2])
resdf$Aag2_mIgG <- rowMeans(resdf[Aag2mIgG])
resdf$aegypti_Ago1 <- rowMeans(resdf[aegyptiAgo1])
resdf$aegypti_rIgG <- rowMeans(resdf[aegyptirIgG])
resdf$aegypti_Ago2 <- rowMeans(resdf[aegyptiAgo2])
resdf$aegypti_mIgG <- rowMeans(resdf[aegyptimIgG])

# write.table(resdf, "/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/uncollapsed_percent_mapped_virus.txt", quote = F, row.names = T, sep = "\t")
CLIPreads <- read.delim("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/uncollapsed_percent_mapped_virus.txt", header = T,  sep = "\t")
CLIPreads <- t(CLIPreads)

#AaegL5
repName <- rep("AaegL5_mapped", length(Aaegl5_files))

sampleName <-gsub(".*viruses_uncoll/|_short.*", "", Aaegl5_files)

readMosMatrix <- data.frame(Aaegl5_files,repName,sampleName)

Samples <- unique(readMosMatrix[,3])
myRes <- list()
for(i in 1:length(Samples)){
  myRes[[i]] <- unlist(sapply(as.vector(readMosMatrix[readMosMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
}

fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readMosMatrix[,2])
rownames(fullRes) <- Samples

# write.table(as.data.frame(fullRes), "/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/uncollapsed_percent_AaegL5.txt", quote = F, row.names = T, sep = "\t")

#total numbers
myRes <- list()
for(i in 1:length(Samples)){
  myRes[[i]] <- unlist(sapply(as.vector(readMosMatrix[readMosMatrix[,3] %in% Samples[i],1]),parseRM))
}

fullRes <- do.call(rbind,myRes)
colnames(fullRes) <- unique(readMosMatrix[,2])
rownames(fullRes) <- Samples
fullRes <- as.data.frame(fullRes)

fullRes$group <- ifelse(row.names(fullRes) %in% Aag2Ago1, paste0("Aag2_Ago1"), NA )
fullRes$group  <- ifelse(row.names(fullRes) %in% Aag2rIgG, paste0("Aag2_rIgG"), paste0(fullRes$group))
fullRes$group  <- ifelse(row.names(fullRes) %in% Aag2Ago2, paste0("Aag2_Ago2"), paste0(fullRes$group))
fullRes$group  <- ifelse(row.names(fullRes) %in% Aag2mIgG, paste0("Aag2_mIgG"), paste0(fullRes$group))
fullRes$group  <- ifelse(row.names(fullRes) %in% aegyptiAgo1, paste0("aegypti_Ago1"), paste0(fullRes$group))
fullRes$group  <- ifelse(row.names(fullRes) %in% aegyptirIgG, paste0("aegypti_rIgG"), paste0(fullRes$group))
fullRes$group  <- ifelse(row.names(fullRes) %in% aegyptiAgo2, paste0("aegypti_Ago2"), paste0(fullRes$group))
fullRes$group  <- ifelse(row.names(fullRes) %in% aegyptimIgG, paste0("aegypti_mIgG"), paste0(fullRes$group))

summ <- fullRes %>% dplyr::group_by(group) %>% summarise(AaegL5_total = sum(AaegL5_mapped))


summ <- summ %>% mutate(norm_factor = 1E6/AaegL5_total)

#write.table(summ, "/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/uncollapsed_total_AaegL5.txt", quote = F, row.names = T, sep = "\t")

Concatenate all CLIP beds for the same virus/sample group together

CFAV_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*CFAV.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", CFAV_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", CFAV_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", CFAV_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", CFAV_beds, value = TRUE)

Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")

cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)

# h <-grep("cat",names(.GlobalEnv),value=TRUE)
# l <- do.call("list",mget(h))
# 
# for(i in seq_along(l)) {
#   write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_CFAV.bed"), 
#               col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }

PCLVL_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*PCLV_L.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", PCLVL_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", PCLVL_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", PCLVL_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", PCLVL_beds, value = TRUE)

Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")

cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)

h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))

# for(i in seq_along(l)) {
#   write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_PCLV_L.bed"),
#               col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }

PCLVS_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*PCLV_S.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", PCLVS_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", PCLVS_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", PCLVS_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", PCLVS_beds, value = TRUE)

Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")

cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)

h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))

# for(i in seq_along(l)) {
#   write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_PCLV_S.bed"),
#               col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }

PCLVM_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*PCLV_M.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", PCLVM_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", PCLVM_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", PCLVM_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", PCLVM_beds, value = TRUE)

Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")

cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)

h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))

# for(i in seq_along(l)) {
#   write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_PCLVM.bed"),
#               col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }

CLYA_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*CLY_A.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", CLYA_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", CLYA_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", CLYA_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", CLYA_beds, value = TRUE)

Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")

cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)

h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))

# for(i in seq_along(l)) {
#   write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_CLYA.bed"),
#               col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }

CLYB_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="*._.*_.*CLY_B.bed$",full.names=TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", CLYB_beds, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", CLYB_beds, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", CLYB_beds, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", CLYB_beds, value = TRUE)

Aag2Ago1bed <- lapply(Aag2Ago1, read.delim, header = FALSE, sep = "\t")
Aag2Ago2bed <- lapply(Aag2Ago2, read.delim, header = FALSE, sep = "\t")
Aag2rIgGbed <- lapply(Aag2rIgG, read.delim, header = FALSE, sep = "\t")
Aag2mIgGbed <- lapply(Aag2mIgG, read.delim, header = FALSE, sep = "\t")

cat_Aag2Ago1 <- do.call("rbind", Aag2Ago1bed)
cat_Aag2Ago2 <- do.call("rbind", Aag2Ago2bed)
cat_Aag2rIgG <- do.call("rbind", Aag2rIgGbed)
cat_Aag2mIgG <- do.call("rbind", Aag2mIgGbed)

h <-grep("cat",names(.GlobalEnv),value=TRUE)
l <- do.call("list",mget(h))

# for(i in seq_along(l)) {
#   write.table(l[i], paste0("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/",names(l)[i], "_CLYB.bed"),
#               col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
# }

Get length/strand bias for CLIP virus-mapped reads

cat_beds <- dir("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll",pattern="cat*",full.names=TRUE)
cat_beds <- grep("*.bed$", cat_beds, value = T)
cat_count <- lapply(cat_beds, import, format = "BED")
names(cat_count) <- gsub("/Volumes/MyPassport/Reprocess_all_datasets/mirdeep_files/persisent_viruses_uncoll/cat_", "", cat_beds)
cat_width <-lapply(cat_count, function(x) data.frame(width = width(x), strand = x@strand))
cat_width <- rbindlist(cat_width, idcol = TRUE)
cat_width <- setNames(cat_width, c("sample", "width", "strand"))
sum_df <- cat_width  %>% group_by(.dots=c("sample","width", "strand")) %>% mutate(count = n()) %>% summarise_all(dplyr::first)

sum_df_short <- sum_df %>% filter(width < 25)

perc_df_short<- sum_df_short %>% group_by(sample) %>% mutate(nrow = sum(count), percent=count/nrow)

perc_df_short$percent <- ifelse(perc_df_short$strand=="-", perc_df_short$percent* -1, perc_df_short$percent*1)

Get length/strand bias for small RNA-seq reads

cat_beds <- dir("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped",pattern="cat*",full.names=TRUE)
cat_beds <- grep("*.bed$", cat_beds, value = T)
cat_count <- lapply(cat_beds, import, format = "BED")
names(cat_count) <- gsub("/Volumes/MyPassport/sRNA_seq_Aag2/uncollapsed_mapped/cat_", "", cat_beds)
cat_width <-lapply(cat_count, function(x) data.frame(width = width(x), strand = x@strand))
cat_width <- rbindlist(cat_width, idcol = TRUE)
cat_width <- setNames(cat_width, c("sample", "width", "strand"))
sum_df <- cat_width  %>% group_by(.dots=c("sample","width", "strand")) %>% mutate(count = n()) %>% summarise_all(dplyr::first)

sum_df_short <- sum_df %>% filter(width < 25)

perc_df_short_sRNA <- sum_df_short %>% group_by(sample) %>% mutate(nrow = sum(count), percent=count/nrow)

perc_df_short_sRNA$percent <- ifelse(perc_df_short_sRNA$strand=="-", perc_df_short_sRNA$percent* -1, perc_df_short_sRNA$percent*1)

Bind CLIP and small RNA-seq together and graph; Figures S5F-S5H

Input file “persistent_viruses_CLIP_smallRNA_seq.txt” is available in my Github

perc_df_short <- perc_df_short[!grepl("IgG", perc_df_short$sample),]
all_uncoll_short <- rbind(perc_df_short_sRNA, perc_df_short)
#write.table(all_uncoll_short, "/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/persistent_viruses_CLIP_smallRNA_seq.txt", col.names = T, row.names = F, quote = F, sep = "\t")

all_uncoll_short <- read.table("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/persistent_viruses_CLIP_smallRNA_seq.txt", header = T, sep = "\t")

ggplot(all_uncoll_short[grepl("CFAV", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) +  scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black",  "black")) + theme_bw() + xlim(17,25)

## Warning: Removed 4 rows containing missing values (geom_bar).

ggplot(all_uncoll_short[grepl("PCLV_L", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) +  scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black",  "black")) + theme_bw() + xlim(17,25)

## Warning: Removed 4 rows containing missing values (geom_bar).

ggplot(all_uncoll_short[grepl("PCLVM|PCLV_M", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) +  scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black",  "black")) + theme_bw() + xlim(17,25)

## Warning: Removed 4 rows containing missing values (geom_bar).

ggplot(all_uncoll_short[grepl("PCLV_S", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) +  scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black",  "black")) + theme_bw() + xlim(17,25)

## Warning: Removed 4 rows containing missing values (geom_bar).

ggplot(all_uncoll_short[grepl("CLY_a|CLYA", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) +  scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black",  "black")) + theme_bw() + xlim(17,25)

## Warning: Removed 4 rows containing missing values (geom_bar).

ggplot(all_uncoll_short[grepl("CLYB|CLY_B", all_uncoll_short$sample),], aes(x = width, y = percent, fill=sample, color=sample)) + geom_bar(stat = "identity", position=position_dodge()) +  scale_fill_manual(values = c("#1B0B80", "#8A0F09", "gray")) + scale_colour_manual(values = c("black", "black",  "black")) + theme_bw() + xlim(17,25)

## Warning: Removed 4 rows containing missing values (geom_bar).

reprocess_Aag2_small_RNA_seq

KRG

12/8/2020