Ae. aegypti genome fasta (AaegL5) and repeat feature gff file used to be available from Vectorbase
However, I am unsure if the repeat feature gff file is still available there; it is too large to upload to Github, but is available upon request
# #had two piggybac entries, DNA/piggyBac and DNA/PiggyBac that was causing a problem, so named them all DNA/PiggyBac
#
# dddsb <- import("/Users/kathryn/Bowtie_indices/piggybac_name_change_Aedes-aegypti-LVP_AGWG_REPEATFEATURES_AaegL5.gff3")
# mySeq <- "/Users/kathryn/Bowtie_indices/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa"
# Rsamtools::indexFa(mySeq)
# myClassesb <- sort(unique(dddsb$class))
#
# #write fastas by repeat feature class
# for(i in 1:length(myClassesb)){
# temp <- dddsb[dddsb$class %in% myClassesb[i],] %>% unique
# fsfsfs <- Rsamtools::getSeq(Rsamtools::FaFile(mySeq),temp)
# names(fsfsfs) <- as.character(temp)
# fsfsfs %>% writeXStringSet(filepath = file.path(dirname(mySeq),paste0(make.names(myClassesb)[i],"_",basename(mySeq))))
# }
#
# #build indices for each repeat feature class
# for(i in 1:length(myClasses)){
# require(Rbowtie2)
# bowtie2_build(references=file.path(dirname(mySeq),paste0(make.names(myClasses)[i],"_",basename(mySeq))),
# bt2Index=gsub("\\.fa","",file.path(dirname(mySeq),paste0(make.names(myClasses)[i],"_",basename(mySeq)))))
# }
#
#
# fqfilles <- dir("/ru-auth/local/home/krozen/tocopy", pattern= "rm5_rm3.fa", full.names = FALSE)
# index<- gsub(".fa","",list.files("/rugpfs/fs0/rice_lab/scratch/krozen/Bowtie_indices_rename_unique",pattern=".fa$",full.names = TRUE))
#
# bplapply(index,function(x,fqfilles){
# library(Rbowtie2)
# fqFiles <- file.path("/ru-auth/local/home/krozen/tocopy",fqfilles)
#
# for(l in 1:length(fqFiles)){
# fq <- fqFiles[l]
# sam=file.path("/ru-auth/local/home/krozen/tocopy",
# paste0(gsub("\\.fq|\\.fastq|\\.rm|\\.fa","",basename(fq)), "_", gsub("\\.fq|\\.fastq|\\.rm|\\.fa",".sam",basename(x))))
# bowtie2(bt2Index = x,
# samOutput = sam,
# seq1 = fq,"--threads 4 -f -L 18 -N 1 ")
# Rsamtools::asBam(sam,gsub("\\.sam","",sam),overwrite = TRUE)
# }
# },fqfilles=fqfilles)
#
# #get percentage of mapped reads by repeat feature class
# bamFiles <- dir("/rugpfs/fs0/rice_lab/scratch/krozen/RF_mapped",pattern="*._.*_.*\\.bam$",full.names=TRUE)
#
# countByFlagStat <- function(bamFile){
# system(paste0("samtools flagstat ",bamFile," > ",paste0(bamFile,"_flagStats.txt")))
# }
# bplapply(bamFiles,countByFlagStat)
#
# FilesToRead <- dir("/rugpfs/fs0/rice_lab/scratch/krozen/RF_mapped",pattern="*._flagStats.txt$",full.names=TRUE)
#
# repName <- gsub(".*_rm5_rm3_|_Aedes-aegypti-.*","",basename(FilesToRead))
# sampleName <- gsub("_rm5_rm3_.*","",basename(FilesToRead))
#
# readRepeatMatrix <- data.frame(FilesToRead,repName,sampleName)
#
# parseFlagstat2 <- function(fileToR){
# flagstats <- read.delim(fileToR,sep=" ",stringsAsFactors=FALSE,header=FALSE)
# Total <- as.numeric(flagstats[1,1])
# Mapped <- as.numeric(flagstats[5,1])
# PercentMapped <- (Mapped/Total)*100
# return(data.frame(PercentMapped=PercentMapped))
# }
# Samples <- unique(readRepeatMatrix[,3])
# myRes <- list()
# for(i in 1:length(Samples)){
# myRes[[i]] <- unlist(sapply(as.vector(readRepeatMatrix[readRepeatMatrix[,3] %in% Samples[i],1]),parseFlagstat2))
# }
#
# #make dataframe with rownames as samples, colnames as repeat feature class,and entries are percentage of mapped reads
#
# fullRes <- do.call(rbind,myRes)
# colnames(fullRes) <- unique(readRepeatMatrix[,2])
# rownames(fullRes) <- Samples
# #save(fullRes,file="/rugpfs/fs0/rice_lab/scratch/krozen/RF_mapped/fullRes.RData")
output file “fullRes.RData” is available on my Github
load("/Users/kathryn/Reprocess_all_paper_datasets/RF/fullRes.RData")
fullRes <- t(fullRes)
#remove ones where all samples are 0
fullResdf <- as.data.frame(fullRes)
#define groups by Ab/lysate
samples <- colnames(fullResdf)
aegypti <- grep("Aae", samples, value = TRUE)
aegypti <- c(aegypti, grep("Aedes", samples, value=TRUE))
cells <- samples[!samples %in% aegypti]
cells_Ago1 <- grep("Ago1", cells, value = TRUE)
cells_Ago1 <- c(cells_Ago1, "KRG081018_Aag2_og_minus", "KRG081018_Aag2_minus")
cells <- cells[!cells %in% cells_Ago1]
cells_rIgG <- grep("rIgG", cells, value=TRUE)
cells_rIgG <- c(cells_rIgG, "KRG081018_Aag2_IgG_minus")
cells <- cells[!cells %in% cells_rIgG]
cells_mIgG <- grep("mIgG", cells, value=TRUE)
cells_Ago2 <- cells[!cells %in% cells_mIgG]
aegypti_Ago1 <- grep("Ago1", aegypti, value = TRUE)
aegypti_Ago1 <- c(aegypti_Ago1, grep("KRG081018", aegypti, value = TRUE))
aegypti <- aegypti[!aegypti %in% aegypti_Ago1]
aegypti_rIgG <- grep("rIgG", aegypti, value=TRUE)
aegypti <- aegypti[!aegypti %in% aegypti_rIgG ]
aegypti_mIgG <- grep("mIgG", aegypti, value=TRUE)
aegypti_Ago2 <- aegypti[!aegypti %in% aegypti_mIgG]
fullResdf$RF <- row.names(fullResdf)
fullResl <- melt(fullResdf, id.vars = c("RF"))
fullResl$group <- ifelse(fullResl$variable %in% aegypti_Ago1, paste0("aegypti_Ago1"), paste0("NA"))
fullResl$group <- ifelse(fullResl$variable %in% aegypti_rIgG, paste0("aegypti_rIgG"), paste0(fullResl$group ))
fullResl$group <- ifelse(fullResl$variable %in% aegypti_Ago2, paste0("aegypti_Ago2"), paste0(fullResl$group))
fullResl$group <- ifelse(fullResl$variable %in% aegypti_mIgG, paste0("aegypti_mIgG"), paste0(fullResl$group ))
fullResl$group <- ifelse(fullResl$variable %in% cells_Ago1, paste0("Aag2_Ago1"), paste0(fullResl$group ))
fullResl$group <- ifelse(fullResl$variable %in% cells_rIgG, paste0("Aag2_rIgG"), paste0(fullResl$group ))
fullResl$group <- ifelse(fullResl$variable %in% cells_Ago2, paste0("Aag2_Ago2"), paste0(fullResl$group ))
fullResl$group <- ifelse(fullResl$variable %in% cells_mIgG, paste0("Aag2_mIgG"), paste0(fullResl$group ))
#get average percent mapped by group
fullResdf$aegyptiAgo1_av <- rowMeans(fullResdf[aegypti_Ago1])
fullResdf$aegyptiAgo2_av <- rowMeans(fullResdf[aegypti_Ago2])
fullResdf$aegyptirIgG_av <- rowMeans(fullResdf[aegypti_rIgG])
fullResdf$aegyptimIgG_av <- rowMeans(fullResdf[aegypti_mIgG])
fullResdf$Aag2Ago1_av <- rowMeans(fullResdf[cells_Ago1])
fullResdf$Aag2Ago2_av <- rowMeans(fullResdf[cells_Ago2])
fullResdf$Aag2rIgG_av <- rowMeans(fullResdf[cells_rIgG])
fullResdf$Aag2mIgG_av <- rowMeans(fullResdf[cells_mIgG])
aegypti_Ago1_filt <- fullResdf %>% dplyr::filter(aegyptiAgo1_av >= 2 * aegyptirIgG_av & aegyptiAgo1_av!=0)
aegypti_Ago2_filt <- fullResdf %>% dplyr::filter(aegyptiAgo2_av >= 2 * aegyptimIgG_av & aegyptiAgo2_av!=0)
Aag2_Ago2_filt <- fullResdf %>% dplyr::filter(Aag2Ago2_av >= 2 * Aag2mIgG_av & Aag2Ago2_av !=0)
Aag2_Ago1_filt <- fullResdf %>% dplyr::filter(Aag2Ago1_av >=2 * Aag2rIgG_av & Aag2Ago2_av !=0)
filt <- rbind(aegypti_Ago1_filt, aegypti_Ago2_filt, Aag2_Ago2_filt, Aag2_Ago1_filt)
filt$sub <- duplicated(filt$RF)
filt <- filt[filt$sub==FALSE,]
tograph <- fullResl[fullResl$RF %in% filt$RF,]
#data was moved to prism and Student's t-tests were perfomed for repeat feature classes meeting filtering critera
#repeat features classes that were signficant were selected to graph - Academ and tRNA were not signficant but were included purely to get the R output visualization to look right
toselect <- c("DNA.Academ.1", "rRNA", "LTR.ERV1", "DNA.hAT.Blackjack", "SINE.tRNA.Deu", "DNA.Ginger", "Low_complexity", "tRNA")
Ago2_RF <- fullResl[fullResl$RF %in% toselect,]
Ago2_RF <- Ago2_RF[!grepl("Ago1", Ago2_RF$group),]
Ago2_RF <- Ago2_RF[!grepl("rIgG", Ago2_RF$group),]
Ago2_RF$group <- factor(Ago2_RF$group, levels = c("Aag2_mIgG", "Aag2_Ago2", "aegypti_mIgG", "aegypti_Ago2"))
ggplot(Ago2_RF, aes(fill=group, y=value, x=RF)) + geom_boxplot(outlier.colour = "gray", position=position_dodge(1.5)) + theme_bw() + facet_wrap(.~ RF, scales = "free_y") + scale_fill_manual(values=c("#F87F11", "#8A0F09", "#E9BA97", "#FA0F0C"))