Chimera_processing

Reverse map all small RNAs to reads that did not map to AaegL5

Input bams were mapped to the AaegL5 AGWG_LVP genome, see Processing_to_matrix script for details; the files are too large for Github, but are available upon requestdata available upon request

Additionally, see CLIPflexR documentation for:
unbam
bowtie2_index
bowtie_align
bamtobed

##get unmapped reads from bam
Dir <- "/rugpfs/fs0/rice_lab/scratch/krozen/AaegL5_mapped/unmapped_for_chimera"
bams <- dir(Dir,pattern="*.bam$",full.names = TRUE)
unbam <- function(bams){
  outfa <- paste0(dirname(bams), "/unmapped_", gsub(".bam", ".fa", basename(bams)))
  cmd <- paste0("samtools fasta -f 4 ", bams, " > ", outfa)
   system(cmd,wait = T)
}

#lapply(bams, unbam)

#make indices for mapping
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera"
unmapped <- dir(Dir,pattern="*.fa$",full.names = TRUE)
bowtie_index <- function(genomeFasta,
                         outFasta=gsub("\\.fa","",genomeFasta)
) {
  require(Rbowtie2)
  if(!dir.exists(outFasta)){
    bowtie2_build(references=genomeFasta, 
                  bt2Index=outFasta)
  }
  
}

#bplapply(unmapped, bowtie_index)

##align all known and putative small RNAs to reads
bowtie_align <- function(fq,index,sam=gsub("\\.fq|\\.fastq|\\.rm|\\.fa",".sam",index)
) {
  require(Rbowtie2)
  if(!dir.exists(sam)){
    bowtie2(bt2Index = index,
            samOutput = sam,
            seq1 = fq,"--threads 4 -f -L 18 -k 1000000")
    Rsamtools::asBam(sam,gsub("\\.sam","",sam))
  }
  
}

#Set Dir with unmapped fasta
Dir <- "/rugpfs/fs0/rice_lab/scratch/krozen/AaegL5_mapped/unmapped_for_chimera"
ref <- dir(Dir,pattern="*.fa$",full.names = TRUE)
ref <- gsub( ".fa", "", ref)

#for (i in ref) {
#  bowtie_align("/rugpfs/fs0/rice_lab/scratch/krozen/all_putative_known.fa", i)
#}

bam <- dir(Dir,pattern="*.bam$",full.names = TRUE)

bamtobed <- function(file,filtDup=FALSE){
  require(GenomicAlignments)
  require(rtracklayer)
  temp <- readGAlignments(file,
                          param=ScanBamParam(what = "qname"))
  names(temp) <- mcols(temp)$qname
  temp <- granges(temp)
  
  if(filtDup) temp <- temp[!duplicated(temp),]
  export.bed(temp,
             con = gsub("\\.bam",".bed",file))
}

#bplapply(bam, bamtobed)

#read in bed and chimera process
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped"
beds <- dir(Dir, pattern="*.bed$",full.names = TRUE)
chimera <- lapply(beds, function(x) {
   if (!file.size(x) == 0) {
       read.delim(x, header = FALSE, sep = "")
   }
})

names(chimera) <- beds
#use this to get empty beds and remove
t <- lapply(chimera, nrow) 
t <- unlist(t) 
t <- names(t) #get names of beds with entries and read these in 

test <- lapply(t, read.delim, header = FALSE, sep = "")
names(test) <- t 
col.names <- c("rowname", "start", "stop", "name", "score", "strand")
test <- lapply(test, setNames, col.names)

t <- gsub("known_novel_sRNA_revmapped/", "", t)
t <- gsub(".bed", ".fa", t) #so t has names of all fastas to read in 

Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera" 
fa <- dir(Dir, pattern="*.fa$",full.names = TRUE)
fasta <- lapply(fa, readDNAStringSet, format = "fasta", nrec = -1L)
names(fasta) <- fa 
fasta <- fasta[names(fasta) %in% t]

fasta <- lapply(fasta, as.data.frame)
fasta <- lapply(fasta, function(x) tibble::rownames_to_column(x))

#merge together read sequence and bed by rowname (read name)
chimera <- map2(fasta, test, ~merge(.x,.y, by = "rowname"))
names(chimera) <- gsub("/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/unmapped_", "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/unmapped_", names(chimera))

#write files to chimera inputs: bed with read name, start, stop, srand, read name, and read sequence

#for (x in names(chimera)) {
#write.table(chimera[[x]], file=paste0(x,".txt"), sep="\t", quote = FALSE)}
#write.table(test, file=".txt", sep="\t")

#process to keep read sequence downstream of small RNA sequence

Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped"
files <- dir(Dir, pattern="*fa.txt$",full.names = TRUE)

chimeraProcess <- function(input,exclude) {
  
  BR1 <- read.delim(input, header=T)
  BR1 <- BR1[!BR1$name %in% exclude,]
  BR1<- BR1[BR1$strand=="+",]
  BR1 <- BR1[duplicated(BR1$rowname)==F,]
  BR1$ups.seq <- mapply(substr, x=BR1$x, start=0, stop=BR1$start)
  BR1$dns.seq <- mapply(substr, x=BR1$x, start=BR1$stop+1, stop=nchar(as.character(BR1$x)))
  outname = paste(input, sep = "")
  outname = gsub(".fa.txt", "_chimera.txt", outname)
  outname = gsub("/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/", "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/", outname)
  write.table(BR1, outname, quote=F, sep="\t")
}

exclude <- c("AAAAAAAAAAAAAAAAAAAAAAAAA")
#this was an "miRNA" discovered by mirdeep that is uninformative, excluded

#for (i in 1:length(files)) {
#  chimeraProcess(files[i], exclude = exclude)
#}

#reformat output as fasta with only sequence downstream of small RNA 
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing"
files <- dir(Dir, pattern="*_chimera.txt$",full.names = TRUE)


reformat <- function(input) {
  BR1 <- read.delim(input, header=T)
  BR1$ID <- paste(BR1$rowname, BR1$name, sep = ";")
  BR1 <- BR1[c("ID","dns.seq")] 
  BR1$dns.seq <- as.character(BR1$dns.seq)
  BR1 <- BR1[nchar(BR1$dns.seq)>=18,]
  outname = paste(input, '.fa', sep = "")
  BR2 <- DNAStringSet(BR1$dns.seq, use.names = TRUE)
  names(BR2) <- BR1$ID
  writeXStringSet(BR2, outname, format ="fasta")  
}


#for (i in 1:length(files)) {
#  reformat(files[i])
#}

#now map downstream sequence to AaegL5 genome 

bowtie_align <- function(fq,index,sam=gsub("\\.fq|\\.fastq|\\.rm|\\.fa",".sam",fq)
) {
  require(Rbowtie2)
  if(!dir.exists(sam)){
    bowtie2(bt2Index = index,
            samOutput = sam,
            seq1 = fq,"--threads 4 -f -N 1 -L 18")
    Rsamtools::asBam(sam,gsub("\\.sam","",sam))
  }
  
}

Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing"
fqfiles <- dir(Dir, pattern= "*txt.fa", full.names = TRUE)
#bplapply(fqfiles, bowtie_align, index = "/Users/kathryn/Bowtie_indices/AaegL5/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5")

#convert remapped to BED:

bamtobed <- function(file,filtDup=FALSE){
  require(GenomicAlignments)
  require(rtracklayer)
  temp <- readGAlignments(file,
                          param=ScanBamParam(what = "qname"))
  names(temp) <- mcols(temp)$qname
  temp <- granges(temp)
  
  if(filtDup) temp <- temp[!duplicated(temp),]
  export.bed(temp,
             con = gsub("\\.bam",".bed",file))
}


Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped"
mapped <- dir(Dir,pattern="bam$",full.names = TRUE)
#bplapply(mapped, bamtobed)

All of the above chimera processing steps can now be accomplished using CLIPflexR’s chimera_Process function; bam files are the inputs and chimera processed, remapped beds are the outputs

Read in remapped chimeric reads, concatenate beds by group and write grouped bed/bigwigs

Additionally, see CLIPflexR documentation for:
CLIP_bw2

Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped"
beds <- dir(Dir, pattern="*.bed$",full.names = TRUE)

chimera <- lapply(beds, function(x) {
   if (!file.size(x) == 0) {
       read.delim(x, header = FALSE, sep = "")
   }
})

names(chimera) <- beds
#use this to remove beds where nothing remapped
t <- lapply(chimera, nrow) 
t <- unlist(t) 
t <- names(t) #get names and read these beds in 

chimera <- lapply(t, read.delim, header = FALSE, sep = "")
names(chimera ) <- t 

#get remapped chimeras by antibody/lysate and write out concatenated beds 
aegyptiAgo1 <- grep("aegypti_Ago1", t, value = TRUE)
aegyptiAgo2 <- grep("aegypti_Ago2", t, value = TRUE)
aegyptirIgG <- grep("aegypti_rIgG", t, value = TRUE)
aegyptimIgG <- grep("aegypti_mIgG", t, value = TRUE)
Aag2Ago1 <- grep("Aag2_Ago1", t, value = TRUE)
Aag2Ago2 <- grep("Aag2_Ago2", t, value = TRUE)
Aag2rIgG <- grep("Aag2_rIgG", t, value = TRUE)
Aag2mIgG <- grep("Aag2_mIgG", t, value = TRUE)

aegyptiAgo1_chimera_remap <-chimera[c(aegyptiAgo1)]
aegyptiAgo2_chimera_remap <- chimera[c(aegyptiAgo2)]
aegyptirIgG_chimera_remap <- chimera[c(aegyptirIgG)]
aegyptimIgG_chimera_remap <- chimera[c(aegyptimIgG)]
Aag2Ago1_chimera_remap <- chimera[c(Aag2Ago1)]
Aag2Ago2_chimera_remap <- chimera[c(Aag2Ago2)]
Aag2rIgG_chimera_remap <- chimera[c(Aag2rIgG)]
Aag2mIgG_chimera_remap <- chimera[c(Aag2mIgG)]

aegyptiAgo1_chimera_remap <- do.call(rbind, aegyptiAgo1_chimera_remap)
aegyptiAgo2_chimera_remap <- do.call(rbind, aegyptiAgo2_chimera_remap)  
aegyptirIgG_chimera_remap <- do.call(rbind, aegyptirIgG_chimera_remap) 
aegyptimIgG_chimera_remap <- do.call(rbind, aegyptimIgG_chimera_remap) 
Aag2Ago1_chimera_remap <- do.call(rbind, Aag2Ago1_chimera_remap) 
Aag2Ago2_chimera_remap <- do.call(rbind, Aag2Ago2_chimera_remap) 
Aag2rIgG_chimera_remap <- do.call(rbind, Aag2rIgG_chimera_remap) 
Aag2mIgG_chimera_remap <- do.call(rbind,Aag2mIgG_chimera_remap) 

h <-grep("_chimera_remap",names(.GlobalEnv),value=TRUE)
h<- do.call("list",mget(h))

#for(i in seq_along(h)) {
#  write.table(h[i], paste(names(h)[i], ".bed", sep = ""), 
#              col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
#}

#make  bigwigs for chimera UTR coverage

pathToBed <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped"

pathToBed <- dir(pathToBed, pattern= "*_remap.bed", full.names = TRUE)


makeCov <- function(pathToBed){
  require(magrittr)
  require(rtracklayer)
  test <- pathToBed %>% import.bed 
    coverage(test,weight = (1/length(test))) %>% export.bw(con=gsub("\\.bed","\\.bw",pathToBed))
}
setwd("/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped")

#bplapply(pathToBed,makeCov)

Chimera_processing

KRG

7/14/2020

Reverse map all small RNAs to reads that did not map to AaegL5

Read in remapped chimeric reads, concatenate beds by group and write grouped bed/bigwigs