Input bams were mapped to the AaegL5 AGWG_LVP genome, see Processing_to_matrix script for details; the files are too large for Github, but are available upon requestdata available upon request
Additionally, see CLIPflexR documentation for:
unbam
bowtie2_index
bowtie_align
bamtobed
##get unmapped reads from bam
Dir <- "/rugpfs/fs0/rice_lab/scratch/krozen/AaegL5_mapped/unmapped_for_chimera"
bams <- dir(Dir,pattern="*.bam$",full.names = TRUE)
unbam <- function(bams){
outfa <- paste0(dirname(bams), "/unmapped_", gsub(".bam", ".fa", basename(bams)))
cmd <- paste0("samtools fasta -f 4 ", bams, " > ", outfa)
system(cmd,wait = T)
}
#lapply(bams, unbam)
#make indices for mapping
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera"
unmapped <- dir(Dir,pattern="*.fa$",full.names = TRUE)
bowtie_index <- function(genomeFasta,
outFasta=gsub("\\.fa","",genomeFasta)
) {
require(Rbowtie2)
if(!dir.exists(outFasta)){
bowtie2_build(references=genomeFasta,
bt2Index=outFasta)
}
}
#bplapply(unmapped, bowtie_index)
##align all known and putative small RNAs to reads
bowtie_align <- function(fq,index,sam=gsub("\\.fq|\\.fastq|\\.rm|\\.fa",".sam",index)
) {
require(Rbowtie2)
if(!dir.exists(sam)){
bowtie2(bt2Index = index,
samOutput = sam,
seq1 = fq,"--threads 4 -f -L 18 -k 1000000")
Rsamtools::asBam(sam,gsub("\\.sam","",sam))
}
}
#Set Dir with unmapped fasta
Dir <- "/rugpfs/fs0/rice_lab/scratch/krozen/AaegL5_mapped/unmapped_for_chimera"
ref <- dir(Dir,pattern="*.fa$",full.names = TRUE)
ref <- gsub( ".fa", "", ref)
#for (i in ref) {
# bowtie_align("/rugpfs/fs0/rice_lab/scratch/krozen/all_putative_known.fa", i)
#}
bam <- dir(Dir,pattern="*.bam$",full.names = TRUE)
bamtobed <- function(file,filtDup=FALSE){
require(GenomicAlignments)
require(rtracklayer)
temp <- readGAlignments(file,
param=ScanBamParam(what = "qname"))
names(temp) <- mcols(temp)$qname
temp <- granges(temp)
if(filtDup) temp <- temp[!duplicated(temp),]
export.bed(temp,
con = gsub("\\.bam",".bed",file))
}
#bplapply(bam, bamtobed)
#read in bed and chimera process
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped"
beds <- dir(Dir, pattern="*.bed$",full.names = TRUE)
chimera <- lapply(beds, function(x) {
if (!file.size(x) == 0) {
read.delim(x, header = FALSE, sep = "")
}
})
names(chimera) <- beds
#use this to get empty beds and remove
t <- lapply(chimera, nrow)
t <- unlist(t)
t <- names(t) #get names of beds with entries and read these in
test <- lapply(t, read.delim, header = FALSE, sep = "")
names(test) <- t
col.names <- c("rowname", "start", "stop", "name", "score", "strand")
test <- lapply(test, setNames, col.names)
t <- gsub("known_novel_sRNA_revmapped/", "", t)
t <- gsub(".bed", ".fa", t) #so t has names of all fastas to read in
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera"
fa <- dir(Dir, pattern="*.fa$",full.names = TRUE)
fasta <- lapply(fa, readDNAStringSet, format = "fasta", nrec = -1L)
names(fasta) <- fa
fasta <- fasta[names(fasta) %in% t]
fasta <- lapply(fasta, as.data.frame)
fasta <- lapply(fasta, function(x) tibble::rownames_to_column(x))
#merge together read sequence and bed by rowname (read name)
chimera <- map2(fasta, test, ~merge(.x,.y, by = "rowname"))
names(chimera) <- gsub("/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/unmapped_", "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/unmapped_", names(chimera))
#write files to chimera inputs: bed with read name, start, stop, srand, read name, and read sequence
#for (x in names(chimera)) {
#write.table(chimera[[x]], file=paste0(x,".txt"), sep="\t", quote = FALSE)}
#write.table(test, file=".txt", sep="\t")
#process to keep read sequence downstream of small RNA sequence
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped"
files <- dir(Dir, pattern="*fa.txt$",full.names = TRUE)
chimeraProcess <- function(input,exclude) {
BR1 <- read.delim(input, header=T)
BR1 <- BR1[!BR1$name %in% exclude,]
BR1<- BR1[BR1$strand=="+",]
BR1 <- BR1[duplicated(BR1$rowname)==F,]
BR1$ups.seq <- mapply(substr, x=BR1$x, start=0, stop=BR1$start)
BR1$dns.seq <- mapply(substr, x=BR1$x, start=BR1$stop+1, stop=nchar(as.character(BR1$x)))
outname = paste(input, sep = "")
outname = gsub(".fa.txt", "_chimera.txt", outname)
outname = gsub("/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/", "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/", outname)
write.table(BR1, outname, quote=F, sep="\t")
}
exclude <- c("AAAAAAAAAAAAAAAAAAAAAAAAA")
#this was an "miRNA" discovered by mirdeep that is uninformative, excluded
#for (i in 1:length(files)) {
# chimeraProcess(files[i], exclude = exclude)
#}
#reformat output as fasta with only sequence downstream of small RNA
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing"
files <- dir(Dir, pattern="*_chimera.txt$",full.names = TRUE)
reformat <- function(input) {
BR1 <- read.delim(input, header=T)
BR1$ID <- paste(BR1$rowname, BR1$name, sep = ";")
BR1 <- BR1[c("ID","dns.seq")]
BR1$dns.seq <- as.character(BR1$dns.seq)
BR1 <- BR1[nchar(BR1$dns.seq)>=18,]
outname = paste(input, '.fa', sep = "")
BR2 <- DNAStringSet(BR1$dns.seq, use.names = TRUE)
names(BR2) <- BR1$ID
writeXStringSet(BR2, outname, format ="fasta")
}
#for (i in 1:length(files)) {
# reformat(files[i])
#}
#now map downstream sequence to AaegL5 genome
bowtie_align <- function(fq,index,sam=gsub("\\.fq|\\.fastq|\\.rm|\\.fa",".sam",fq)
) {
require(Rbowtie2)
if(!dir.exists(sam)){
bowtie2(bt2Index = index,
samOutput = sam,
seq1 = fq,"--threads 4 -f -N 1 -L 18")
Rsamtools::asBam(sam,gsub("\\.sam","",sam))
}
}
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing"
fqfiles <- dir(Dir, pattern= "*txt.fa", full.names = TRUE)
#bplapply(fqfiles, bowtie_align, index = "/Users/kathryn/Bowtie_indices/AaegL5/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5")
#convert remapped to BED:
bamtobed <- function(file,filtDup=FALSE){
require(GenomicAlignments)
require(rtracklayer)
temp <- readGAlignments(file,
param=ScanBamParam(what = "qname"))
names(temp) <- mcols(temp)$qname
temp <- granges(temp)
if(filtDup) temp <- temp[!duplicated(temp),]
export.bed(temp,
con = gsub("\\.bam",".bed",file))
}
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped"
mapped <- dir(Dir,pattern="bam$",full.names = TRUE)
#bplapply(mapped, bamtobed)
All of the above chimera processing steps can now be accomplished using CLIPflexR’s chimera_Process function; bam files are the inputs and chimera processed, remapped beds are the outputs
Additionally, see CLIPflexR documentation for:
CLIP_bw2
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped"
beds <- dir(Dir, pattern="*.bed$",full.names = TRUE)
chimera <- lapply(beds, function(x) {
if (!file.size(x) == 0) {
read.delim(x, header = FALSE, sep = "")
}
})
names(chimera) <- beds
#use this to remove beds where nothing remapped
t <- lapply(chimera, nrow)
t <- unlist(t)
t <- names(t) #get names and read these beds in
chimera <- lapply(t, read.delim, header = FALSE, sep = "")
names(chimera ) <- t
#get remapped chimeras by antibody/lysate and write out concatenated beds
aegyptiAgo1 <- grep("aegypti_Ago1", t, value = TRUE)
aegyptiAgo2 <- grep("aegypti_Ago2", t, value = TRUE)
aegyptirIgG <- grep("aegypti_rIgG", t, value = TRUE)
aegyptimIgG <- grep("aegypti_mIgG", t, value = TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", t, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", t, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", t, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", t, value = TRUE)
aegyptiAgo1_chimera_remap <-chimera[c(aegyptiAgo1)]
aegyptiAgo2_chimera_remap <- chimera[c(aegyptiAgo2)]
aegyptirIgG_chimera_remap <- chimera[c(aegyptirIgG)]
aegyptimIgG_chimera_remap <- chimera[c(aegyptimIgG)]
Aag2Ago1_chimera_remap <- chimera[c(Aag2Ago1)]
Aag2Ago2_chimera_remap <- chimera[c(Aag2Ago2)]
Aag2rIgG_chimera_remap <- chimera[c(Aag2rIgG)]
Aag2mIgG_chimera_remap <- chimera[c(Aag2mIgG)]
aegyptiAgo1_chimera_remap <- do.call(rbind, aegyptiAgo1_chimera_remap)
aegyptiAgo2_chimera_remap <- do.call(rbind, aegyptiAgo2_chimera_remap)
aegyptirIgG_chimera_remap <- do.call(rbind, aegyptirIgG_chimera_remap)
aegyptimIgG_chimera_remap <- do.call(rbind, aegyptimIgG_chimera_remap)
Aag2Ago1_chimera_remap <- do.call(rbind, Aag2Ago1_chimera_remap)
Aag2Ago2_chimera_remap <- do.call(rbind, Aag2Ago2_chimera_remap)
Aag2rIgG_chimera_remap <- do.call(rbind, Aag2rIgG_chimera_remap)
Aag2mIgG_chimera_remap <- do.call(rbind,Aag2mIgG_chimera_remap)
h <-grep("_chimera_remap",names(.GlobalEnv),value=TRUE)
h<- do.call("list",mget(h))
#for(i in seq_along(h)) {
# write.table(h[i], paste(names(h)[i], ".bed", sep = ""),
# col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
#}
#make bigwigs for chimera UTR coverage
pathToBed <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped"
pathToBed <- dir(pathToBed, pattern= "*_remap.bed", full.names = TRUE)
makeCov <- function(pathToBed){
require(magrittr)
require(rtracklayer)
test <- pathToBed %>% import.bed
coverage(test,weight = (1/length(test))) %>% export.bw(con=gsub("\\.bed","\\.bw",pathToBed))
}
setwd("/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped")
#bplapply(pathToBed,makeCov)