Map all samples to repeat feature classes individually and get percentage of processed reads

Ae. aegypti genome fasta (AaegL5) and repeat feature gff file used to be available from Vectorbase
However, I am unsure if the repeat feature gff file is still available there; it is too large to upload to Github, but is available upon request

# #had two piggybac entries, DNA/piggyBac and DNA/PiggyBac that was  causing a problem, so named them all DNA/PiggyBac 
# 
# dddsb <- import("/Users/kathryn/Bowtie_indices/piggybac_name_change_Aedes-aegypti-LVP_AGWG_REPEATFEATURES_AaegL5.gff3")
# mySeq <- "/Users/kathryn/Bowtie_indices/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa"
# Rsamtools::indexFa(mySeq)
# myClassesb <- sort(unique(dddsb$class))
# 
# #write fastas by repeat feature class
# for(i in 1:length(myClassesb)){
#   temp <- dddsb[dddsb$class %in% myClassesb[i],] %>% unique
#   fsfsfs <- Rsamtools::getSeq(Rsamtools::FaFile(mySeq),temp)
#   names(fsfsfs)  <-  as.character(temp)
#   fsfsfs %>% writeXStringSet(filepath = file.path(dirname(mySeq),paste0(make.names(myClassesb)[i],"_",basename(mySeq))))
# }
# 
# #build indices for each repeat feature class
# for(i in 1:length(myClasses)){
#   require(Rbowtie2)
#   bowtie2_build(references=file.path(dirname(mySeq),paste0(make.names(myClasses)[i],"_",basename(mySeq))),
#                 bt2Index=gsub("\\.fa","",file.path(dirname(mySeq),paste0(make.names(myClasses)[i],"_",basename(mySeq)))))
# }
# 
# 
# fqfilles <- dir("/ru-auth/local/home/krozen/tocopy", pattern= "rm5_rm3.fa", full.names = FALSE)
# index<- gsub(".fa","",list.files("/rugpfs/fs0/rice_lab/scratch/krozen/Bowtie_indices_rename_unique",pattern=".fa$",full.names = TRUE))
# 
# bplapply(index,function(x,fqfilles){
#   library(Rbowtie2)
#   fqFiles <- file.path("/ru-auth/local/home/krozen/tocopy",fqfilles)
#   
#   for(l in 1:length(fqFiles)){
#     fq <- fqFiles[l]
#     sam=file.path("/ru-auth/local/home/krozen/tocopy",
#                   paste0(gsub("\\.fq|\\.fastq|\\.rm|\\.fa","",basename(fq)), "_", gsub("\\.fq|\\.fastq|\\.rm|\\.fa",".sam",basename(x))))
#     bowtie2(bt2Index = x,
#             samOutput = sam,
#             seq1 = fq,"--threads 4 -f -L 18 -N 1 ")
#     Rsamtools::asBam(sam,gsub("\\.sam","",sam),overwrite = TRUE)
#   }
# },fqfilles=fqfilles)
# 
# #get percentage of mapped reads by repeat feature class
# bamFiles <- dir("/rugpfs/fs0/rice_lab/scratch/krozen/RF_mapped",pattern="*._.*_.*\\.bam$",full.names=TRUE)
# 
# countByFlagStat <- function(bamFile){
#   system(paste0("samtools  flagstat ",bamFile," > ",paste0(bamFile,"_flagStats.txt")))
# }
# bplapply(bamFiles,countByFlagStat)
# 
# FilesToRead <- dir("/rugpfs/fs0/rice_lab/scratch/krozen/RF_mapped",pattern="*._flagStats.txt$",full.names=TRUE)
# 
# repName <- gsub(".*_rm5_rm3_|_Aedes-aegypti-.*","",basename(FilesToRead))
# sampleName <- gsub("_rm5_rm3_.*","",basename(FilesToRead))
# 
# readRepeatMatrix <- data.frame(FilesToRead,repName,sampleName)
# 
# parseFlagstat2 <- function(fileToR){
#   flagstats <- read.delim(fileToR,sep=" ",stringsAsFactors=FALSE,header=FALSE)
#   Total <- as.numeric(flagstats[1,1])
#   Mapped <- as.numeric(flagstats[5,1])
#   PercentMapped <- (Mapped/Total)*100
#   return(data.frame(PercentMapped=PercentMapped))
# }
# Samples <- unique(readRepeatMatrix[,3])
# myRes <- list()
# for(i in 1:length(Samples)){
#   myRes[[i]] <- unlist(sapply(as.vector(readRepeatMatrix[readRepeatMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
# }
# 
# #make dataframe with rownames as samples, colnames as repeat feature class,and entries are percentage of mapped reads
# 
# fullRes <- do.call(rbind,myRes)
# colnames(fullRes) <- unique(readRepeatMatrix[,2])
# rownames(fullRes) <- Samples
# #save(fullRes,file="/rugpfs/fs0/rice_lab/scratch/krozen/RF_mapped/fullRes.RData")

output file “fullRes.RData” is available on my Github

Filter repeat feature classes where percentage mapped was significantly higher in AGO2 Ab sample compared to IgG control, Figure 6B

load("/Users/kathryn/Reprocess_all_paper_datasets/RF/fullRes.RData")
fullRes <- t(fullRes)
#remove ones where all samples are 0
fullResdf <- as.data.frame(fullRes)
#define groups by Ab/lysate
samples <- colnames(fullResdf)
aegypti <- grep("Aae", samples, value = TRUE)
aegypti <- c(aegypti, grep("Aedes", samples, value=TRUE))
cells <- samples[!samples %in% aegypti]
cells_Ago1 <- grep("Ago1", cells, value = TRUE)
cells_Ago1 <- c(cells_Ago1, "KRG081018_Aag2_og_minus", "KRG081018_Aag2_minus")
cells <- cells[!cells %in% cells_Ago1]
cells_rIgG <- grep("rIgG", cells, value=TRUE)
cells_rIgG <- c(cells_rIgG, "KRG081018_Aag2_IgG_minus")
cells <- cells[!cells %in% cells_rIgG]
cells_mIgG <- grep("mIgG", cells, value=TRUE)
cells_Ago2 <- cells[!cells %in% cells_mIgG]

aegypti_Ago1 <- grep("Ago1", aegypti, value = TRUE)
aegypti_Ago1 <- c(aegypti_Ago1, grep("KRG081018", aegypti, value = TRUE))
aegypti <- aegypti[!aegypti %in% aegypti_Ago1]
aegypti_rIgG <- grep("rIgG", aegypti, value=TRUE)
aegypti <- aegypti[!aegypti %in% aegypti_rIgG ]
aegypti_mIgG <- grep("mIgG", aegypti, value=TRUE)
aegypti_Ago2 <- aegypti[!aegypti %in% aegypti_mIgG]

fullResdf$RF <- row.names(fullResdf)
fullResl <- melt(fullResdf, id.vars = c("RF"))

fullResl$group <- ifelse(fullResl$variable %in%  aegypti_Ago1, paste0("aegypti_Ago1"), paste0("NA"))
fullResl$group <- ifelse(fullResl$variable %in%  aegypti_rIgG, paste0("aegypti_rIgG"), paste0(fullResl$group ))
fullResl$group  <- ifelse(fullResl$variable %in%  aegypti_Ago2, paste0("aegypti_Ago2"), paste0(fullResl$group))
fullResl$group  <- ifelse(fullResl$variable %in%  aegypti_mIgG, paste0("aegypti_mIgG"), paste0(fullResl$group ))
fullResl$group  <- ifelse(fullResl$variable %in%  cells_Ago1, paste0("Aag2_Ago1"), paste0(fullResl$group ))
fullResl$group  <- ifelse(fullResl$variable %in%  cells_rIgG, paste0("Aag2_rIgG"), paste0(fullResl$group ))
fullResl$group  <- ifelse(fullResl$variable %in%  cells_Ago2, paste0("Aag2_Ago2"), paste0(fullResl$group ))
fullResl$group  <- ifelse(fullResl$variable %in%  cells_mIgG, paste0("Aag2_mIgG"), paste0(fullResl$group ))

#get average percent mapped by group
fullResdf$aegyptiAgo1_av <- rowMeans(fullResdf[aegypti_Ago1])
fullResdf$aegyptiAgo2_av <- rowMeans(fullResdf[aegypti_Ago2])
fullResdf$aegyptirIgG_av <- rowMeans(fullResdf[aegypti_rIgG])
fullResdf$aegyptimIgG_av <- rowMeans(fullResdf[aegypti_mIgG])
fullResdf$Aag2Ago1_av <- rowMeans(fullResdf[cells_Ago1])
fullResdf$Aag2Ago2_av <- rowMeans(fullResdf[cells_Ago2])
fullResdf$Aag2rIgG_av <- rowMeans(fullResdf[cells_rIgG])
fullResdf$Aag2mIgG_av <- rowMeans(fullResdf[cells_mIgG])

aegypti_Ago1_filt <- fullResdf  %>% dplyr::filter(aegyptiAgo1_av >= 2 * aegyptirIgG_av & aegyptiAgo1_av!=0)
aegypti_Ago2_filt <- fullResdf  %>% dplyr::filter(aegyptiAgo2_av >= 2 * aegyptimIgG_av & aegyptiAgo2_av!=0)
Aag2_Ago2_filt <- fullResdf  %>% dplyr::filter(Aag2Ago2_av >= 2 * Aag2mIgG_av & Aag2Ago2_av !=0)
Aag2_Ago1_filt <- fullResdf  %>% dplyr::filter(Aag2Ago1_av >=2 * Aag2rIgG_av & Aag2Ago2_av !=0)

filt <- rbind(aegypti_Ago1_filt, aegypti_Ago2_filt, Aag2_Ago2_filt, Aag2_Ago1_filt)
filt$sub <- duplicated(filt$RF)
filt <- filt[filt$sub==FALSE,]
tograph <- fullResl[fullResl$RF %in% filt$RF,]

#data was moved to prism and Student's t-tests were perfomed for repeat feature classes meeting filtering critera
#repeat features classes that were signficant were selected to graph - Academ and tRNA were not signficant but were included purely to get the R output visualization to look right

toselect <- c("DNA.Academ.1", "rRNA", "LTR.ERV1", "DNA.hAT.Blackjack", "SINE.tRNA.Deu", "DNA.Ginger", "Low_complexity", "tRNA")
Ago2_RF <- fullResl[fullResl$RF %in% toselect,]
Ago2_RF <- Ago2_RF[!grepl("Ago1", Ago2_RF$group),]
Ago2_RF <- Ago2_RF[!grepl("rIgG", Ago2_RF$group),]
Ago2_RF$group <- factor(Ago2_RF$group, levels = c("Aag2_mIgG", "Aag2_Ago2", "aegypti_mIgG", "aegypti_Ago2"))

ggplot(Ago2_RF, aes(fill=group, y=value, x=RF))  + geom_boxplot(outlier.colour = "gray", position=position_dodge(1.5)) + theme_bw() + facet_wrap(.~ RF, scales = "free_y") + scale_fill_manual(values=c("#F87F11", "#8A0F09", "#E9BA97", "#FA0F0C"))