smallRNA_abundance_scatterplots

Use chimeras remapped to AaegL5 to count the number of each small RNA per sample, group the samples by antibody/lysate, and normalize the counts for visualization.

Chimera BED files remapped to the AaegL5 genome (“chimera_beds.zip”) were generated by the chimera_process_to_upload script and are available on my GitHub.

# Directory that holds the chimera BED files remapped to AaegL5.
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera/known_novel_sRNA_revmapped/alternative_processing/AaegL5_remapped"

# `pattern` is a regular expression, not a shell glob: escape the dot and
# anchor at the end so only files whose name really ends in ".bed" are listed.
beds <- dir(Dir, pattern = "\\.bed$", full.names = TRUE)

# Keep only files whose *name* contains "KRG"; matching on the basename keeps a
# directory component of the path from ever satisfying the filter.
beds <- beds[grepl("KRG", basename(beds), fixed = TRUE)]

# Read every non-empty BED file exactly once. Zero-byte files (samples where
# nothing remapped) cannot be parsed by read.delim(), so they yield NULL here
# and are dropped below.
chimera <- lapply(beds, function(path) {
  if (file.size(path) > 0) {
    # sep = "" splits fields on any run of whitespace
    read.delim(path, header = FALSE, sep = "")
  }
})
names(chimera) <- beds

# Remove the NULL placeholders left by empty files; the remaining elements
# stay named by their file path.
chimera <- Filter(Negate(is.null), chimera)

# Paths of the BED files that were actually read. The name `t` is retained
# for backward compatibility in case later code refers to it.
t <- names(chimera)

# For each sample, tally how many rows carry each small RNA. The small RNA
# label is whatever follows the first ";" in column V4 of the BED table.
chimera_counts <- lapply(chimera, function(bed) {
  # Split V4 into exactly two pieces at the first ";" and keep the second one.
  name_parts <- str_split_fixed(bed$V4, ";", 2)
  small_rna <- setNames(as.data.frame(name_parts[, 2]), nm = "smallRNA")

  # One row per small RNA with the number of rows it appeared in.
  by_small_rna <- group_by(small_rna, smallRNA)
  summarize(by_small_rna, count = n())
})

# Merge the per-sample count tables into one wide table keyed on smallRNA
# (full outer join: a small RNA missing from a sample gets NA for that sample).
#
# Every per-sample table names its value column "count", so merging them
# directly yields many indistinguishable "count.x"/"count.y" columns and a
# duplicated-names warning on each merge. Rename each value column to its
# sample ID first so every merged column is unique and traceable to its file.
sample_ids <- names(chimera_counts)
if (is.null(sample_ids)) {
  # Unnamed list: fall back to positional IDs.
  sample_ids <- as.character(seq_along(chimera_counts))
}
# File name without directory or ".bed" extension; make.unique() guards
# against accidental collisions.
sample_ids <- make.unique(sub("\\.bed$", "", basename(sample_ids)))

# Each table is expected to hold exactly two columns: smallRNA and its count.
chimera_counts <- Map(
  function(counts, id) setNames(counts, c("smallRNA", id)),
  chimera_counts,
  sample_ids
)

chimera_counts <- Reduce(
  function(x, y) merge(x, y, by = "smallRNA", all = TRUE),
  chimera_counts
)

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the
## result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the
## result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are
## duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

## Warning in merge.data.frame(x, y, by = "smallRNA", all = TRUE): column
## names 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y',
## 'count.x', 'count.y' are duplicated in the result

# A small RNA that is absent from a sample comes out of the outer merge as NA;
# record it as zero chimeras.
chimera_counts[is.na(chimera_counts)] <- 0

# Sample name = bed file name without its directory and without the
# "_chimera.txt.bed" suffix. basename() avoids repeating the absolute input
# directory here, and fixed = TRUE makes the dots in the suffix literal.
n <- basename(t)
n <- gsub("_chimera.txt.bed", "", n, fixed = TRUE)
names(chimera) <- n

# One count column per bed file, in the order the files were merged.
col.names <- c("smallRNA", n)
if (length(col.names) != ncol(chimera_counts)) {
  stop(
    "expected ", length(col.names), " columns in chimera_counts but found ",
    ncol(chimera_counts),
    call. = FALSE
  )
}
chimera_counts <- setNames(chimera_counts, col.names)

# Sample columns for each antibody/lysate group, picked by matching the group
# tag inside the sample name
aegyptiAgo1 <- col.names[grepl("aegypti_Ago1", col.names)]
aegyptiAgo2 <- col.names[grepl("aegypti_Ago2", col.names)]
aegyptirIgG <- col.names[grepl("aegypti_rIgG", col.names)]
aegyptimIgG <- col.names[grepl("aegypti_mIgG", col.names)]
Aag2Ago1 <- col.names[grepl("Aag2_Ago1", col.names)]
Aag2Ago2 <- col.names[grepl("Aag2_Ago2", col.names)]
Aag2rIgG <- col.names[grepl("Aag2_rIgG", col.names)]
Aag2mIgG <- col.names[grepl("Aag2_mIgG", col.names)]

# Total chimera count per small RNA, summed over the samples of each
# antibody/lysate group
chimera_counts[["aegyptiAgo1"]] <-
  rowSums(chimera_counts[, aegyptiAgo1, drop = FALSE])
chimera_counts[["aegyptiAgo2"]] <-
  rowSums(chimera_counts[, aegyptiAgo2, drop = FALSE])
chimera_counts[["aegyptirIgG"]] <-
  rowSums(chimera_counts[, aegyptirIgG, drop = FALSE])
chimera_counts[["aegyptimIgG"]] <-
  rowSums(chimera_counts[, aegyptimIgG, drop = FALSE])
chimera_counts[["Aag2Ago1"]] <-
  rowSums(chimera_counts[, Aag2Ago1, drop = FALSE])
chimera_counts[["Aag2Ago2"]] <-
  rowSums(chimera_counts[, Aag2Ago2, drop = FALSE])
chimera_counts[["Aag2rIgG"]] <-
  rowSums(chimera_counts[, Aag2rIgG, drop = FALSE])
chimera_counts[["Aag2mIgG"]] <-
  rowSums(chimera_counts[, Aag2mIgG, drop = FALSE])

# Biological complexity (BC) for each small RNA by ab/lysate: the number of
# samples (replicates) in the group with at least one chimera.

# Count, per row, how many of the given sample columns are > 0.
# Vectorised replacement for a row-wise apply() over the data frame; the
# result is kept as integer, matching what length(which(x > 0)) produced.
replicates_with_chimeras <- function(counts) {
  as.integer(rowSums(as.matrix(counts) > 0, na.rm = TRUE))
}

chimera_counts$aegypti_Ago1_BC <-
  replicates_with_chimeras(chimera_counts[aegyptiAgo1])
chimera_counts$aegypti_Ago2_BC <-
  replicates_with_chimeras(chimera_counts[aegyptiAgo2])
chimera_counts$aegypti_rIgG_BC <-
  replicates_with_chimeras(chimera_counts[aegyptirIgG])
chimera_counts$aegypti_mIgG_BC <-
  replicates_with_chimeras(chimera_counts[aegyptimIgG])
chimera_counts$Aag2_Ago1_BC <-
  replicates_with_chimeras(chimera_counts[Aag2Ago1])
chimera_counts$Aag2_Ago2_BC <-
  replicates_with_chimeras(chimera_counts[Aag2Ago2])
chimera_counts$Aag2_rIgG_BC <-
  replicates_with_chimeras(chimera_counts[Aag2rIgG])
chimera_counts$Aag2_mIgG_BC <-
  replicates_with_chimeras(chimera_counts[Aag2mIgG])

#get input reads (unmapped, before any processing) by group to normalize
#following lines are commented out, but were run to get values for normalization below
#Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/unmapped_for_chimera"
#ref <- dir(Dir,pattern="*.fa$",full.names = TRUE)


#A <- grep(".*(Ago1.*aegypti|aegypti.*Ago1).*", ref, value="TRUE")
#B <- grep(".*(Ago2.*aegypti|aegypti.*Ago2).*", ref, value="TRUE")
#C <- grep(".*(rIgG.*aegypti|aegypti.*rIgG).*", ref, value="TRUE")
#D <- grep(".*(mIgG.*aegypti|aegypti.*mIgG).*", ref, value="TRUE")
#E <- grep(".*(Ago1.*Aag2|Aag2.*Ago1).*", ref, value="TRUE")
#G <- grep(".*(Ago2.*Aag2|Aag2.*Ago2).*", ref, value="TRUE")
#H <- grep(".*(rIgG.*Aag2|Aag2.*rIgG).*", ref, value="TRUE")
#I <- grep(".*(mIgG.*Aag2|Aag2.*mIgG).*", ref, value="TRUE")


#fastaLength <- function(input) { 
#  require(Biostrings)
#  fa <- lapply(input, readDNAStringSet, format = "fasta", nrec = -1L) 
# t <- lapply(fa, length)
# t <- sum(unlist(t))
# print(t)
#}

#fastaLength(A) #1839545
#fastaLength(B) #762423
#fastaLength(C) #745977
#fastaLength(D) #402387
#fastaLength(E) #3368128
#fastaLength(G) #2964194
#fastaLength(H) #2632945
#fastaLength(I) #1423698

# Pseudocount of 1 on every group total so log2 / log2FC are defined for
# small RNAs with zero chimeras. The group totals stored above are the same
# row sums, so they are reused here rather than recomputed.
chimera_counts[["aegyptiAgo1_pseudocount"]] <- chimera_counts[["aegyptiAgo1"]] + 1
chimera_counts[["aegyptiAgo2_pseudocount"]] <- chimera_counts[["aegyptiAgo2"]] + 1
chimera_counts[["aegyptirIgG_pseudocount"]] <- chimera_counts[["aegyptirIgG"]] + 1
chimera_counts[["aegyptimIgG_pseudocount"]] <- chimera_counts[["aegyptimIgG"]] + 1
chimera_counts[["Aag2Ago1_pseudocount"]] <- chimera_counts[["Aag2Ago1"]] + 1
chimera_counts[["Aag2Ago2_pseudocount"]] <- chimera_counts[["Aag2Ago2"]] + 1
chimera_counts[["Aag2rIgG_pseudocount"]] <- chimera_counts[["Aag2rIgG"]] + 1
chimera_counts[["Aag2mIgG_pseudocount"]] <- chimera_counts[["Aag2mIgG"]] + 1

# Scale each pseudocounted total to counts per million input reads.
# Input read totals per group are the unmapped-fasta record counts noted next
# to the commented-out fastaLength() calls above.
input_reads <- c(
  aegyptiAgo1 = 1839545,
  aegyptiAgo2 = 762423,
  aegyptirIgG = 745977,
  aegyptimIgG = 402387,
  Aag2Ago1 = 3368128,
  Aag2Ago2 = 2964194,
  Aag2rIgG = 2632945,
  Aag2mIgG = 1423698
)

chimera_counts[["aegyptiAgo1_norm"]] <-
  chimera_counts[["aegyptiAgo1_pseudocount"]] / input_reads[["aegyptiAgo1"]] * 1E6
chimera_counts[["aegyptiAgo2_norm"]] <-
  chimera_counts[["aegyptiAgo2_pseudocount"]] / input_reads[["aegyptiAgo2"]] * 1E6
chimera_counts[["aegyptirIgG_norm"]] <-
  chimera_counts[["aegyptirIgG_pseudocount"]] / input_reads[["aegyptirIgG"]] * 1E6
chimera_counts[["aegyptimIgG_norm"]] <-
  chimera_counts[["aegyptimIgG_pseudocount"]] / input_reads[["aegyptimIgG"]] * 1E6
chimera_counts[["Aag2Ago1_norm"]] <-
  chimera_counts[["Aag2Ago1_pseudocount"]] / input_reads[["Aag2Ago1"]] * 1E6
chimera_counts[["Aag2Ago2_norm"]] <-
  chimera_counts[["Aag2Ago2_pseudocount"]] / input_reads[["Aag2Ago2"]] * 1E6
chimera_counts[["Aag2rIgG_norm"]] <-
  chimera_counts[["Aag2rIgG_pseudocount"]] / input_reads[["Aag2rIgG"]] * 1E6
chimera_counts[["Aag2mIgG_norm"]] <-
  chimera_counts[["Aag2mIgG_pseudocount"]] / input_reads[["Aag2mIgG"]] * 1E6


# log2 of the normalized counts, one column per group. Columns are created in
# the original order (Aag2 Ago2 before Aag2 Ago1) so column positions do not
# shift.
chimera_counts[["log2aegyptiAgo1_norm_chimera"]] <-
  log2(chimera_counts[["aegyptiAgo1_norm"]])
chimera_counts[["log2aegyptiAgo2_norm_chimera"]] <-
  log2(chimera_counts[["aegyptiAgo2_norm"]])
chimera_counts[["log2aegyptirIgG_norm_chimera"]] <-
  log2(chimera_counts[["aegyptirIgG_norm"]])
chimera_counts[["log2aegyptimIgG_norm_chimera"]] <-
  log2(chimera_counts[["aegyptimIgG_norm"]])
chimera_counts[["log2Aag2Ago2_norm_chimera"]] <-
  log2(chimera_counts[["Aag2Ago2_norm"]])
chimera_counts[["log2Aag2Ago1_norm_chimera"]] <-
  log2(chimera_counts[["Aag2Ago1_norm"]])
chimera_counts[["log2Aag2rIgG_norm_chimera"]] <-
  log2(chimera_counts[["Aag2rIgG_norm"]])
chimera_counts[["log2Aag2mIgG_norm_chimera"]] <-
  log2(chimera_counts[["Aag2mIgG_norm"]])

# log2 fold change of Ago1 over Ago2 within each lysate, taken as the
# difference of the log2 columns just stored
chimera_counts[["log2FCAag2Ago1overAgo2_chimera"]] <-
  chimera_counts[["log2Aag2Ago1_norm_chimera"]] -
  chimera_counts[["log2Aag2Ago2_norm_chimera"]]
chimera_counts[["log2FCaegyptiAgo1overAgo2_chimera"]] <-
  chimera_counts[["log2aegyptiAgo1_norm_chimera"]] -
  chimera_counts[["log2aegyptiAgo2_norm_chimera"]]

Merge chimera counts and normalized miRNA counts together to make the scatterplot

Raw miRNA counts are needed so they can be normalized with a pseudocount of 1 for log2 visualization. The “novel_miRNAs_filtered_counts_all_info.txt”, “known_miRNAs_allcols.txt”, and “all_putative_known_sRNA_seeds2.txt” files were generated in the mirdeep2_processing_filtering_to_upload script. All these input files are available in my Github.

Input fasta “aae_miRNAs_mature_fixed.fa” was downloaded from miRBase and is available in my Github

# Read the novel (sRNAs) and known (miRNAs) tab-separated count tables
sRNAs <- read.delim(
  "/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/novel_miRNAs_filtered_counts_all_info.txt",
  header = TRUE,
  sep = "\t"
)
miRNAs <- read.delim(
  "/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/known_miRNAs_allcols.txt",
  header = TRUE,
  sep = "\t"
)

# The same column positions are taken from both tables; rbind() below relies
# on the two selections having matching column names.
keep_cols <- c(1, 141:150, 123:124, 99:106)
x <- miRNAs[, keep_cols]
z <- sRNAs[, keep_cols]

# Stack known on top of novel, then rename columns: "counts_norm" -> "norm"
# and "miRNA" -> "smallRNA" (this yields the "smallRNA" merge key used later)
filt_sRNAs <- rbind(x, z)
col.names <- gsub(
  "miRNA", "smallRNA",
  gsub("counts_norm", "norm", colnames(filt_sRNAs))
)
names(filt_sRNAs) <- col.names

# Left join: keep every filtered small RNA, attaching chimera counts where
# present (small RNAs without chimeras get NA in the chimera columns)
chim_merge <- merge(
  x = filt_sRNAs,
  y = chimera_counts,
  by = "smallRNA",
  all.x = TRUE
)

seed_table <- read.delim(
  "/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/all_putative_known_sRNA_seeds2.txt",
  header = TRUE,
  sep = "\t"
)

# Attach the seed columns; only small RNAs present in both tables are kept
to_merge <- seed_table[, c("Row.names", "FL", "six_mer", "six_mer_target")]
chim_merge <- merge(
  x = chim_merge,
  y = to_merge,
  by.x = "smallRNA",
  by.y = "Row.names"
)

# Flag small RNAs that are not themselves named "aae" but whose six_mer
# matches positions 2-7 of a mature aae miRNA from the miRBase fasta
aae <- readDNAStringSet(
  "/Users/kathryn/mirdeep2_master/aae_miRNAs_mature_fixed.fa",
  format = "fasta",
  nrec = -1L
)
aae_seeds <- subseq(aae, start = 2, width = 6)

shares_aae_seed <- chim_merge$six_mer %in% aae_seeds
not_named_aae <- !str_detect(chim_merge$smallRNA, "aae")
chim_merge$related_to_aae <- ifelse(shares_aae_seed & not_named_aae, "yes", "no")

Stat smooth func to add regression line to data

#add fun to add regression line with equation from ggplot
# source: https://gist.github.com/kdauria/524eade46135f6348140
# Layer constructor for StatSmoothFunc: same arguments as a smoothing stat plus
# `xpos`/`ypos`, which set where the equation label is placed.
# All stat parameters (and anything passed through `...`) are forwarded to
# ggplot2's layer() as `params`.
stat_smooth_func <- function(mapping = NULL, data = NULL,
                             geom = "smooth", position = "identity",
                             ...,
                             method = "auto",
                             formula = y ~ x,
                             se = TRUE,
                             n = 80,
                             span = 0.75,
                             fullrange = FALSE,
                             level = 0.95,
                             method.args = list(),
                             na.rm = FALSE,
                             show.legend = NA,
                             inherit.aes = TRUE,
                             xpos = NULL,
                             ypos = NULL) {
  # Collect the stat parameters first; extra arguments from `...` go last.
  stat_params <- list(
    method = method,
    formula = formula,
    se = se,
    n = n,
    fullrange = fullrange,
    level = level,
    na.rm = na.rm,
    method.args = method.args,
    span = span,
    xpos = xpos,
    ypos = ypos,
    ...
  )

  layer(
    data = data,
    mapping = mapping,
    stat = StatSmoothFunc,
    geom = geom,
    position = position,
    show.legend = show.legend,
    inherit.aes = inherit.aes,
    params = stat_params
  )
}

# ggproto stat that fits a model per group and returns ONE row per group:
# the label position (x, y) and a plotmath string "y = a + b . x, r^2 = ...".
# It is meant to be drawn with geom = "text" and parse = TRUE.
#
# The label is built from coef(model)[1:2] and summary(model)$r.squared, so it
# is only meaningful for fits that expose those (e.g. method = "lm").
StatSmoothFunc <- ggproto("StatSmooth", Stat,

  # Resolve method = "auto": loess when the largest group has < 1000 rows,
  # otherwise gam with a cubic regression spline. "gam" is mapped to mgcv::gam.
  setup_params = function(data, params) {
    if (identical(params$method, "auto")) {
      max_group <- max(table(data$group))

      if (max_group < 1000) {
        params$method <- "loess"
      } else {
        params$method <- "gam"
        params$formula <- y ~ s(x, bs = "cs")
      }
    }
    if (identical(params$method, "gam")) {
      params$method <- mgcv::gam
    }

    params
  },

  # Arguments se, n, fullrange, xseq and level are unused here but stay in the
  # signature so the stat keeps accepting the same parameters as before.
  # NOTE(review): ggplot2 appears to derive accepted params from these formals.
  compute_group = function(data, scales, method = "auto", formula = y~x,
                           se = TRUE, n = 80, span = 0.75, fullrange = FALSE,
                           xseq = NULL, level = 0.95, method.args = list(),
                           na.rm = FALSE, xpos=NULL, ypos=NULL) {
    if (length(unique(data$x)) < 2) {
      # Not enough data to perform fit
      return(data.frame())
    }

    if (is.null(data$weight)) data$weight <- 1

    # The prediction grid (xseq) that a smoothing stat would build was dead
    # code here: only the fitted coefficients are used, so it is not computed.

    # Special case span because it's the most commonly used model argument
    if (identical(method, "loess")) {
      method.args$span <- span
    }

    if (is.character(method)) method <- match.fun(method)

    # Symbols are quoted so the model call shows `formula`, `data`, `weight`
    # rather than the deparsed objects.
    base.args <- list(quote(formula), data = quote(data), weights = quote(weight))
    model <- do.call(method, c(base.args, method.args))

    # Build the plotmath equation string from intercept, slope and r-squared.
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
                     list(a = format(coef(model)[1], digits = 3),
                          b = format(coef(model)[2], digits = 3),
                          r2 = format(summary(model)$r.squared, digits = 3)))
    func_string <- as.character(as.expression(eq))

    # Default label position: 90% of the group's min x / max y.
    if (is.null(xpos)) xpos <- min(data$x) * 0.9
    if (is.null(ypos)) ypos <- max(data$y) * 0.9
    data.frame(x = xpos, y = ypos, label = func_string)
  },

  required_aes = c("x", "y")
)

Visualize, Figures 5D and S4F

#filter on BC, remove Ago2 specific sRNAs
# Keep small RNAs with an Ago1 BC value above 2 in mosquito or in Aag2 cells.
ago1_supported <- chim_merge$aegypti_Ago1_BC > 2 | chim_merge$Aag2_Ago1_BC > 2
Ago1_chim_merge <- chim_merge[which(ago1_supported), ]
Ago1_chim_merge$aegypti_type <- as.character(Ago1_chim_merge$aegypti_type)
Ago1_chim_merge$Aag2_type <- as.character(Ago1_chim_merge$Aag2_type)

# Drop rows that look Ago2-specific: a row survives if ANY of the four
# conditions holds (rows where the combined condition is NA are dropped).
not_ago2_specific <-
  Ago1_chim_merge$aegypti_type != "Ago2" |
  Ago1_chim_merge$Aag2_type != "Ago2" |
  Ago1_chim_merge$log2FCaegyptiAgo1overAgo2_chimera > 0 |
  Ago1_chim_merge$log2FCAag2Ago1overAgo2_chimera > 0
Ago1_chim_merge <- Ago1_chim_merge[which(not_ago2_specific), ]

# Scatterplot: revmapped vs chimera-derived log2 normalized counts (aegypti, Ago1).
aegypti_ago1_plot_df <-
  Ago1_chim_merge[which(Ago1_chim_merge$aegypti_type != "Ago2"), ]

ggplot(
  data = aegypti_ago1_plot_df,
  mapping = aes(x = log2aegyptiAgo1_norm, y = log2aegyptiAgo1_norm_chimera)
) +
  stat_smooth_func(geom = "text", method = "lm", hjust = 0, parse = TRUE) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  geom_point(
    aes(
      colour = log2FCaegyptiAgo1overAgo2_chimera,
      shape = aegypti_type,
      size = log2FCaegyptiAgo1overAgo2_chimera
    ),
    stroke = 1,
    alpha = 1
  ) +
  scale_shape_manual(values = c(1, 19)) +
  scale_colour_gradient2(high = "#3360A9", mid = "gray", low = "#FA0F0C") +
  scale_size(range = c(1, 4)) +
  theme_bw() +
  xlim(-2, 17) +
  ylim(-2, 17) +
  coord_equal() +
  labs(
    x = "Ago1 aegypti revmap (log2 normalized counts)",
    y = "Ago1 aegypti chimera (log2 normalized counts)"
  ) +
  theme(axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  # Example of labelling one seed family (labels come from the unfiltered table).
  geom_text_repel(
    data = Ago1_chim_merge[grepl("TCACTA", Ago1_chim_merge$six_mer_target), ],
    aes(label = smallRNA),
    size = 2, nudge_x = 10, force = 10
  )

# Two-sided Pearson correlation on the same rows that are plotted.
cor.test(
  subset(Ago1_chim_merge$log2aegyptiAgo1_norm, Ago1_chim_merge$aegypti_type != "Ago2"),
  subset(Ago1_chim_merge$log2aegyptiAgo1_norm_chimera, Ago1_chim_merge$aegypti_type != "Ago2"),
  alternative = "two.sided",
  method = "pearson"
) #p-value < 2.2e-16

## 
##  Pearson's product-moment correlation
## 
## data:  subset(Ago1_chim_merge$log2aegyptiAgo1_norm, Ago1_chim_merge$aegypti_type !=  and subset(Ago1_chim_merge$log2aegyptiAgo1_norm_chimera, Ago1_chim_merge$aegypti_type !=     "Ago2") and     "Ago2")
## t = 16.633, df = 109, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7844144 0.8924710
## sample estimates:
##       cor 
## 0.8469721

# Scatterplot: revmapped vs chimera-derived log2 normalized counts (Aag2, Ago1).
aag2_ago1_plot_df <-
  Ago1_chim_merge[which(Ago1_chim_merge$Aag2_type != "Ago2"), ]

ggplot(
  data = aag2_ago1_plot_df,
  mapping = aes(x = log2Aag2Ago1_norm, y = log2Aag2Ago1_norm_chimera)
) +
  stat_smooth_func(geom = "text", method = "lm", hjust = 0, parse = TRUE) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  geom_point(
    aes(
      colour = log2FCAag2Ago1overAgo2_chimera,
      shape = Aag2_type,
      size = log2FCAag2Ago1overAgo2_chimera
    ),
    stroke = 1
  ) +
  scale_shape_manual(values = c(1, 19)) +
  scale_size(range = c(1, 4)) +
  scale_colour_gradient2(high = "#1B0B80", mid = "gray", low = "#8A0F09") +
  theme_bw() +
  xlim(-5, 16) +
  ylim(-5, 16) +
  coord_equal() +
  labs(
    x = "Ago1 Aag2 revmap (log2 normalized counts)",
    y = "Ago1 Aag2 chimera (log2 normalized counts)"
  ) +
  theme(axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  geom_text_repel(
    data = Ago1_chim_merge[grepl("GAATTG", Ago1_chim_merge$six_mer_target), ],
    aes(label = smallRNA),
    size = 2, nudge_x = 10, force = 10
  )

# Two-sided Pearson correlation on the same rows that are plotted.
cor.test(
  subset(Ago1_chim_merge$log2Aag2Ago1_norm, Ago1_chim_merge$Aag2_type != "Ago2"),
  subset(Ago1_chim_merge$log2Aag2Ago1_norm_chimera, Ago1_chim_merge$Aag2_type != "Ago2"),
  alternative = "two.sided",
  method = "pearson"
)

## 
##  Pearson's product-moment correlation
## 
## data:  subset(Ago1_chim_merge$log2Aag2Ago1_norm, Ago1_chim_merge$Aag2_type !=  and subset(Ago1_chim_merge$log2Aag2Ago1_norm_chimera, Ago1_chim_merge$Aag2_type !=     "Ago2") and     "Ago2")
## t = 27.741, df = 125, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8984709 0.9484631
## sample estimates:
##      cor 
## 0.927506

#p-value < 2.2e-16

Figures S6C and S6E

# Keep small RNAs with an Ago2 BC value above 2 in mosquito or in Aag2 cells.
ago2_supported <- chim_merge$aegypti_Ago2_BC > 2 | chim_merge$Aag2_Ago2_BC > 2
Ago2_chim_merge <- chim_merge[which(ago2_supported), ]
Ago2_chim_merge$aegypti_type <- as.character(Ago2_chim_merge$aegypti_type)
Ago2_chim_merge$Aag2_type <- as.character(Ago2_chim_merge$Aag2_type)

# Drop rows that look Ago1-specific: a row survives if ANY condition holds.
not_ago1_specific <-
  Ago2_chim_merge$aegypti_type != "Ago1" |
  Ago2_chim_merge$Aag2_type != "Ago1" |
  Ago2_chim_merge$log2FCaegyptiAgo1overAgo2_chimera < 0 |
  Ago2_chim_merge$log2FCAag2Ago1overAgo2_chimera < 0
Ago2_chim_merge <- Ago2_chim_merge[which(not_ago1_specific), ]

# Scatterplot (aegypti, Ago2): only rows with a negative Ago1/Ago2 chimera
# log2FC that are not typed "Ago1".
aegypti_ago2_plot_rows <-
  Ago2_chim_merge$log2FCaegyptiAgo1overAgo2_chimera < 0 &
  Ago2_chim_merge$aegypti_type != "Ago1"
aegypti_ago2_plot_df <- Ago2_chim_merge[which(aegypti_ago2_plot_rows), ]

ggplot(
  data = aegypti_ago2_plot_df,
  mapping = aes(x = log2aegyptiAgo2_norm, y = log2aegyptiAgo2_norm_chimera)
) +
  stat_smooth_func(geom = "text", method = "lm", hjust = 0, parse = TRUE) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  geom_point(
    aes(
      colour = log2FCaegyptiAgo1overAgo2_chimera,
      shape = aegypti_type,
      size = log2FCaegyptiAgo1overAgo2_chimera
    ),
    stroke = 1
  ) +
  scale_shape_manual(values = c(5, 19)) +
  scale_colour_gradient2(high = "#3360A9", mid = "gray", low = "#FA0F0C") +
  scale_size(trans = 'reverse', range = c(1, 4)) +
  theme_bw() +
  xlim(-2, 15) +
  ylim(-2, 15) +
  coord_equal() +
  labs(
    x = "Ago2 aegypti revmap (log2 normalized counts)",
    y = "Ago2 aegypti chimera (log2 normalized counts)"
  ) +
  theme(axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  geom_text_repel(
    data = Ago2_chim_merge[grepl("ACCCAA", Ago2_chim_merge$six_mer_target), ],
    aes(label = smallRNA),
    size = 2, nudge_x = 10, force = 10
  )

# NOTE(review): this test filters only on aegypti_type != "Ago1", whereas the
# plot above additionally requires log2FC < 0 - confirm that is intended.
cor.test(
  subset(Ago2_chim_merge$log2aegyptiAgo2_norm, Ago2_chim_merge$aegypti_type != "Ago1"),
  subset(Ago2_chim_merge$log2aegyptiAgo2_norm_chimera, Ago2_chim_merge$aegypti_type != "Ago1"),
  alternative = "two.sided",
  method = "pearson"
) #1.208e-09

## 
##  Pearson's product-moment correlation
## 
## data:  subset(Ago2_chim_merge$log2aegyptiAgo2_norm, Ago2_chim_merge$aegypti_type !=  and subset(Ago2_chim_merge$log2aegyptiAgo2_norm_chimera, Ago2_chim_merge$aegypti_type !=     "Ago1") and     "Ago1")
## t = 6.939, df = 75, p-value = 1.208e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4666663 0.7449374
## sample estimates:
##      cor 
## 0.625288

# Scatterplot (Aag2, Ago2): only rows with a negative Ago1/Ago2 chimera log2FC
# that are not typed "Ago1".
aag2_ago2_plot_rows <-
  Ago2_chim_merge$log2FCAag2Ago1overAgo2_chimera < 0 &
  Ago2_chim_merge$Aag2_type != "Ago1"
aag2_ago2_plot_df <- Ago2_chim_merge[which(aag2_ago2_plot_rows), ]

ggplot(
  data = aag2_ago2_plot_df,
  mapping = aes(x = log2Aag2Ago2_norm, y = log2Aag2Ago2_norm_chimera)
) +
  stat_smooth_func(geom = "text", method = "lm", hjust = 0, parse = TRUE) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  geom_point(
    aes(
      colour = log2FCAag2Ago1overAgo2_chimera,
      shape = Aag2_type,
      size = log2FCAag2Ago1overAgo2_chimera
    ),
    stroke = 1
  ) +
  scale_shape_manual(values = c(5, 19)) +
  scale_size(trans = 'reverse', range = c(1, 4)) +
  scale_colour_gradient2(mid = "gray") +
  theme_bw() +
  xlim(-4, 15) +
  ylim(-4, 15) +
  coord_equal() +
  labs(
    x = "Ago2 Aag2 revmap (log2 normalized counts)",
    y = "Ago2 Aag2 chimera (log2 normalized counts)"
  ) +
  theme(axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  geom_text_repel(
    data = Ago2_chim_merge[grepl("CAAATC", Ago2_chim_merge$six_mer_target), ],
    aes(label = smallRNA),
    size = 2, nudge_x = 10, force = 10
  )

# NOTE(review): this test filters only on Aag2_type != "Ago1", whereas the
# plot above additionally requires log2FC < 0 - confirm that is intended.
cor.test(
  subset(Ago2_chim_merge$log2Aag2Ago2_norm, Ago2_chim_merge$Aag2_type != "Ago1"),
  subset(Ago2_chim_merge$log2Aag2Ago2_norm_chimera, Ago2_chim_merge$Aag2_type != "Ago1"),
  alternative = "two.sided",
  method = "pearson"
) #4.137e-14

## 
##  Pearson's product-moment correlation
## 
## data:  subset(Ago2_chim_merge$log2Aag2Ago2_norm, Ago2_chim_merge$Aag2_type !=  and subset(Ago2_chim_merge$log2Aag2Ago2_norm_chimera, Ago2_chim_merge$Aag2_type !=     "Ago1") and     "Ago1")
## t = 9.5864, df = 66, p-value = 4e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6410787 0.8472175
## sample estimates:
##       cor 
## 0.7628958

Format Table S4

The input file “novel_mirdeep_filtered.txt” was generated in the mirdeep2_processing_filtering_to_upload script and is available in my Github.

miRNAs fastas “aae_miRNAs_mature_fixed.fa”, “aga_miRNAs_mature.fa” (Anopheles gambiae), “cqu_mature_fixed.fa” (Culex quinquefasciatus), and “dme_miRNAs_mature.fa” (Drosophila melanogaster) were downloaded from miRBase and are available in my Github

#delete individual sample raw chimera counts
TS6 <- chim_merge[,c(1, 153:155, 14:21, 12:13, 111:152)]
TS6 <- TS6[,c(1:30,55:56)]

#get RPM without pseudocount (only needed that for log2 vis/log2FC)

#for small RNAs; see mirdeep2_processing_filtering_to_upload script for how I got normalization numbers
TS6$Aag2rIgG_counts_norm <- ((TS6$Aag2rIgG_counts)/1640381)*1E6
TS6$Aag2Ago1_counts_norm <- ((TS6$Aag2Ago1_counts)/7340095)*1E6
TS6$Aag2mIgG_counts_norm <- ((TS6$Aag2mIgG_counts)/1609376)*1E6
TS6$Aag2Ago2_counts_norm <- ((TS6$Aag2Ago2_counts)/4763515)*1E6
TS6$aegyptiAgo1_counts_norm <- ((TS6$aegyptiAgo1_counts)/4270728)*1E6
TS6$aegyptiAgo2_counts_norm <- ((TS6$aegyptiAgo2_counts)/928064)*1E6
TS6$aegyptirIgG_counts_norm <- ((TS6$aegyptirIgG_counts)/647502)*1E6
TS6$aegyptimIgG_counts_norm <- ((TS6$aegyptimIgG_counts)/356597)*1E6

TS6 <- TS6[,c(1:4,33:40, 13:32)]

#for small RNA chimeras

TS6$Aag2rIgG_norm_chimera <- ((TS6$Aag2rIgG)/2632945)*1E6
TS6$Aag2Ago1_norm_chimera <- ((TS6$Aag2Ago1)/3368128)*1E6
TS6$Aag2mIgG_norm_chimera <- ((TS6$Aag2mIgG)/1423698)*1E6
TS6$Aag2Ago2_norm_chimera <- ((TS6$Aag2Ago2)/2964194)*1E6
TS6$aegyptirIgG_norm_chimera <- ((TS6$aegyptirIgG)/745977)*1E6
TS6$aegyptiAgo1_norm_chimera <- ((TS6$aegyptiAgo1)/1839545)*1E6
TS6$aegyptimIgG_norm_chimera <- ((TS6$aegyptimIgG)/402387)*1E6
TS6$aegyptiAgo2_norm_chimera <- ((TS6$aegyptiAgo2)/762423)*1E6

TS6 <- TS6[,c(1:14,33:40, 31:32,29,27, 30, 28, 25, 23, 26, 24)]
col.names <- colnames(TS6)
col.names <- gsub("BC", "BC_chimera", col.names)
TS6 <- setNames(TS6, col.names)

#now merge with precursor locations, mirdeep score, and related species
mirdeep <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/mirdeep_results/novel_mirdeep_filtered.txt", header = TRUE, sep = "\t")
mirdeep_to_merge <- mirdeep[,c(4:5,11,13,22,18)]

TS6_m <- merge(TS6, mirdeep_to_merge, by.x = "FL", by.y = "DNA_seq", all.x = TRUE)
col.names <- colnames(TS6_m)
col.names <- c(col.names[1:32], "miRDeep2_score", "estimated_probability_smallRNA_is_true_positive", "signficant_Randfold", col.names[36:37])
TS6_m <- setNames(TS6_m, col.names)

#simplify randfold col
TS6_m$signficant_Randfold <- gsub('\\;.*', '', TS6_m$signficant_Randfold)

#format related species cols 
# For each species' mature miRNA fasta: take positions 2-7 of every sequence,
# and build a table with one row per distinct six_mer listing the record names
# that share it (joined with ":"). The four tables are then left-joined onto
# TS6_m by six_mer.
# as.data.frame() on the seed set yields a column named "x" (renamed to
# "six_mer") with the fasta record names as row names.

# aegypti (same fasta path and seed extraction as used earlier in this file)
aae <- readDNAStringSet("/Users/kathryn/mirdeep2_master/aae_miRNAs_mature_fixed.fa", format = "fasta", nrec = -1L)
aae_seeds <- subseq(aae, start=2, end=NA, width=6)
aae_df <- as.data.frame(aae_seeds)
aae_df$related_aae_miRNAs <- row.names(aae_df)
names(aae_df)[names(aae_df)=="x"] <- "six_mer"
aae_df <- aae_df %>% group_by(six_mer) %>% summarise(aae_smallRNA_family =paste0(related_aae_miRNAs,collapse=":"))


# Culex quinquefasciatus: record names are cut at the first space and the
# "cqT" prefix is rewritten to "cqu".
cqu <- readDNAStringSet("/Users/kathryn/mirdeep2_master/cqu_mature_fixed.fa", format = "fasta", nrec = -1L)
cqu_seeds <- subseq(cqu, start=2, end=NA, width=6)
cqu_df <- as.data.frame(cqu_seeds)
cqu_df$related_cqu_miRNAs <- row.names(cqu_df)
names(cqu_df)[names(cqu_df)=="x"] <- "six_mer"
cqu_df$related_cqu_miRNAs <- str_split_fixed(cqu_df$related_cqu_miRNAs, " ", 2)[,1]
cqu_df$related_cqu_miRNAs <- gsub("cqT", "cqu", cqu_df$related_cqu_miRNAs)
cqu_df <- cqu_df %>% group_by(six_mer) %>% summarise(cqu_related_miRNA_family =paste0(related_cqu_miRNAs,collapse=":"))


# Anopheles gambiae: record names are cut at the first space.
aga <- readDNAStringSet("/Users/kathryn/mirdeep2_master/aga_miRNAs_mature.fa", format = "fasta", nrec = -1L)
#get pos 2-7
aga_seeds <- subseq(aga, start=2, end=NA, width=6)
aga_df <- as.data.frame(aga_seeds)
aga_df$related_aga_miRNAs <- row.names(aga_df)
names(aga_df)[names(aga_df)=="x"] <- "six_mer"
aga_df$related_aga_miRNAs <- str_split_fixed(aga_df$related_aga_miRNAs, " ", 2)[,1]
aga_df <- aga_df %>% group_by(six_mer) %>% summarise(aga_related_miRNA_family =paste0(related_aga_miRNAs,collapse=":"))

# Drosophila melanogaster: record names are cut at the first space.
dme <- readDNAStringSet("/Users/kathryn/mirdeep2_master/dme_miRNAs_mature.fa", format = "fasta", nrec = -1L)
#get pos 2-7
dme_seeds <- subseq(dme, start=2, end=NA, width=6)
dme_df <- as.data.frame(dme_seeds)
dme_df$related_dme_miRNAs <- row.names(dme_df)
names(dme_df)[names(dme_df)=="x"] <- "six_mer"
dme_df$related_dme_miRNAs <- str_split_fixed(dme_df$related_dme_miRNAs , " ", 2)[,1]
dme_df <- dme_df %>% group_by(six_mer) %>% summarise(dme_related_miRNA_family =paste0(related_dme_miRNAs,collapse=":"))

# Left joins keep every TS6_m row; small RNAs whose six_mer is absent from a
# species get NA in that species' family column.
TS6_m <- merge(TS6_m, aae_df, by = "six_mer", all.x=TRUE)
TS6_m <- merge(TS6_m, cqu_df, by = "six_mer", all.x=TRUE)
TS6_m <- merge(TS6_m, aga_df, by = "six_mer", all.x=TRUE)
TS6_m <- merge(TS6_m, dme_df, by = "six_mer", all.x=TRUE)

# Positional reorder of the merged table; column 36 is dropped.
# NOTE(review): which column sits at position 36 depends on the merge order
# above - confirm before changing any merge.
TS6_m <- TS6_m[,c(3, 2, 1, 4:35, 37:41)]
names(TS6_m)[names(TS6_m)=="FL"] <- "sequence"

#last thing is need to clean up score and probability column
# Return, for every element of `col`, the largest number found in it.
#
# Each element is scanned for runs of digits, "." and "-" (e.g. "5.2; -1.3"
# yields 5.2 and -1.3) and the maximum of those numbers is returned.
#
# col: character vector (factors are converted with as.character()).
# Returns a numeric vector of the same length. Elements that are NA or contain
# no numeric token (for example the string "NA") give NA; previously such
# strings gave -Inf with a warning. A token that cannot be parsed as a number
# (e.g. a lone "-") still makes the whole element NA, as before.
getmax <- function(col) {
  col <- as.character(col)
  # "[0-9.-]+" : inside a bracket the dot is literal and a trailing "-" is literal.
  tokens <- regmatches(col, gregexpr("[0-9.-]+", col))
  vapply(
    tokens,
    function(x) {
      if (length(x) == 0L) {
        return(NA_real_)
      }
      max(as.numeric(x), na.rm = FALSE)
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

# Collapse multi-valued mirdeep columns to one clean value per small RNA.

# Score: keep the highest score reported for the sequence.
TS6_m$miRDeep2_score <- getmax(TS6_m$miRDeep2_score)

# Probability: split on "; ", de-duplicate, drop literal "NA" entries and
# re-join. lapply/vapply replace the former `for (i in 1:length(t))` loops,
# which failed on an empty table because 1:0 iterates over c(1, 0).
t <- str_split(TS6_m$estimated_probability_smallRNA_is_true_positive, "; ")
b <- lapply(t, function(vals) {
  vals <- unique(vals)
  vals[vals != "NA"]
})
TS6_m$estimated_probability_smallRNA_is_true_positive <-
  vapply(b, paste0, character(1), collapse = "; ")

# Precursor coordinates: same clean-up.
t <- str_split(TS6_m$precursor.coordinate, "; ")
b <- lapply(t, function(vals) {
  vals <- unique(vals)
  vals[vals != "NA"]
})
TS6_m$precursor.coordinate <- vapply(b, paste0, character(1), collapse = "; ")
names(TS6_m)[names(TS6_m)=="precursor.coordinate"] <- "precursor_coordinate"

Get top lists for known and novel and rename top

# Build the Ago1 "top 10" seed-target lists (known, novel, novel from chimeras)
# and combine them into master DNAStringSets. Each list holds six_mer_target
# sequences named by family. top_n(10) ranks by the last summarised column
# (see the "Selecting by" messages) and keeps ties.
TS <- TS6_m

##Get top most abundant known in mosquito
TSk <- TS[grepl("aae", TS$smallRNA),]
topAgo1aae <- TSk %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = dplyr::first(aae_smallRNA_family),
            aegyptiAgo1_counts_norm = sum(aegyptiAgo1_counts_norm)) %>%
  top_n(10)

## Selecting by aegyptiAgo1_counts_norm

topAgo1aaek <- DNAStringSet(topAgo1aae$six_mer_target)
names(topAgo1aaek) <- topAgo1aae$aae_smallRNA_family
#writeXStringSet(topAgo1aaek, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_aegypti_Ago1_known.fa", format = "fasta")

topAgo1cells <- TSk %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = dplyr::first(aae_smallRNA_family),
            Aag2Ago1_counts_norm = sum(Aag2Ago1_counts_norm)) %>%
  top_n(10)

## Selecting by Aag2Ago1_counts_norm

topAgo1cellsk <- DNAStringSet(topAgo1cells$six_mer_target)
names(topAgo1cellsk) <- topAgo1cells$aae_smallRNA_family
#writeXStringSet(topAgo1cellsk, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_cells_Ago1_known.fa", format = "fasta")  ##conf

#Get most abundant novel small RNAs

TSnovel <- TS[is.na(TS$aae_smallRNA_family),] #take completely novel families

##Ago1 novel aegypti
topAgo1aegypti <- TSnovel %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = paste0(smallRNA, collapse = ":"),
            aegyptiAgo1_counts_norm = sum(aegyptiAgo1_counts_norm)) %>%
  top_n(10)

## Selecting by aegyptiAgo1_counts_norm

topAgo1aaen <- DNAStringSet(topAgo1aegypti$six_mer_target)
names(topAgo1aaen) <- topAgo1aegypti$aae_smallRNA_family
#writeXStringSet(topAgo1aaen, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_aegypti_Ago1_novel.fa", format = "fasta")

##Ago1 novel cells
# Note: this overwrites the known-miRNA `topAgo1cells` table from above; the
# known list was already extracted into topAgo1cellsk.
topAgo1cells <- TSnovel %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = paste0(smallRNA, collapse = ":"),
            Aag2Ago1_counts_norm = sum(Aag2Ago1_counts_norm)) %>%
  top_n(10) ##good!

## Selecting by Aag2Ago1_counts_norm

topAgo1cellsn <- DNAStringSet(topAgo1cells$six_mer_target)
names(topAgo1cellsn) <- topAgo1cells$aae_smallRNA_family
#writeXStringSet(topAgo1cellsn, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_cells_Ago1_novel.fa", format = "fasta")

#chimera Ago1
# Novel = id does not contain "aae" AND seed not shared with a known aegypti miRNA.
novel_Ago1_chim_merge <- Ago1_chim_merge[!grepl("aae", Ago1_chim_merge$smallRNA),]
novel_Ago1_chim_merge <- subset(novel_Ago1_chim_merge, novel_Ago1_chim_merge$related_to_aae=="no")

#mosquito
topAgo1aegyptichim <- novel_Ago1_chim_merge %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = paste0(smallRNA, collapse = ":"),
            aegyptiAgo1_norm_chimera = sum(aegyptiAgo1_norm)) %>%
  top_n(10)

## Selecting by aegyptiAgo1_norm_chimera

topAgo1aae_chim <- DNAStringSet(topAgo1aegyptichim$six_mer_target)
names(topAgo1aae_chim) <- topAgo1aegyptichim$aae_smallRNA_family
#writeXStringSet(topAgo1aae_chim, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_aegypti_Ago1_novel_chimera.fa", format = "fasta")

#cells
Ago1cellchim <- novel_Ago1_chim_merge %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = paste0(smallRNA, collapse = ":"),
            Aag2Ago1_norm_chimera = sum(Aag2Ago1_norm)) %>%
  top_n(10)

## Selecting by Aag2Ago1_norm_chimera

topAgo1cellchim <- DNAStringSet(Ago1cellchim$six_mer_target)
names(topAgo1cellchim) <- Ago1cellchim$aae_smallRNA_family
#writeXStringSet(topAgo1cellchim, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_cells_Ago1_novel_chimera2.fa", format = "fasta")

#Ago1 novel master lists

#aegypti
topAgo1aegypti_all <- c(topAgo1aae_chim, topAgo1aaen)
topAgo1aegypti_all <- unique(topAgo1aegypti_all)
# Fixed: this commented export used to write topAgo1cellchim (the cells chimera
# list) into the aegypti master file.
#writeXStringSet(topAgo1aegypti_all, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/aegypti_Ago1_novel.fa", format = "fasta")

#cells
topAgo1cells_all <- c(topAgo1cellsn, topAgo1cellchim)
topAgo1cells_all <- unique(topAgo1cells_all)
#writeXStringSet(topAgo1cells_all, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/cells_Ago1_novel2.fa", format = "fasta")


##all Ago1 novel together
all_Ago1n_top <- c(topAgo1cells_all, topAgo1aegypti_all)
all_Ago1n_top <- unique(all_Ago1n_top)

allAgo1_n_k <- c(topAgo1cellsk, topAgo1aaek, all_Ago1n_top)
allAgo1_n_k <- unique(allAgo1_n_k)
#writeXStringSet(allAgo1_n_k , "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/all_Ago1_known_and_novel_rn2.fa", format = "fasta")


##Ago2

##Ago2 known
# Build the Ago2 seed-target lists (known, novel, novel from chimeras) and
# combine them into master DNAStringSets, mirroring the Ago1 lists.

##Ago2 known
# Known families with a negative Ago1/Ago2 log2FC in BOTH mosquito and cells.
topAgo2k <- TSk %>%
  filter(log2FCaegyptiAgo1overAgo2 < 0 & log2FCAag2Ago1overAgo2 < 0) %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = dplyr::first(aae_smallRNA_family))

topAgo2bothk <- DNAStringSet(topAgo2k$six_mer_target)
names(topAgo2bothk) <- topAgo2k$aae_smallRNA_family

##Ago2 novel aegypti
topAgo2aegypti <- TSnovel %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = paste0(smallRNA, collapse = ":"),
            aegyptiAgo2_counts_norm = sum(aegyptiAgo2_counts_norm)) %>%
  top_n(10)

## Selecting by aegyptiAgo2_counts_norm

topAgo2aaen <- DNAStringSet(topAgo2aegypti$six_mer_target)
names(topAgo2aaen) <- topAgo2aegypti$aae_smallRNA_family
#writeXStringSet(topAgo2aaen, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_aegypti_Ago2_novel.fa", format = "fasta")

#cells
topAgo2cellsdf <- TSnovel %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = paste0(smallRNA, collapse = ":"),
            Aag2Ago2_counts_norm = sum(Aag2Ago2_counts_norm)) %>%
  top_n(10)

## Selecting by Aag2Ago2_counts_norm

topAgo2cellsn <- DNAStringSet(topAgo2cellsdf$six_mer_target)
names(topAgo2cellsn) <- topAgo2cellsdf$aae_smallRNA_family
#writeXStringSet(topAgo2cellsn, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_cells_Ago2_novel.fa", format = "fasta")

#chimera top for Ago2
# Novel = id does not contain "aae" AND seed not shared with a known aegypti miRNA.
novel_Ago2_chim_merge <- Ago2_chim_merge[!grepl("aae", Ago2_chim_merge$smallRNA),]
novel_Ago2_chim_merge <- subset(novel_Ago2_chim_merge, novel_Ago2_chim_merge$related_to_aae=="no")

#mosquito
topAgo2aegyptichim <- novel_Ago2_chim_merge %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = paste0(smallRNA, collapse = ":"),
            aegyptiAgo2_norm_chimera = sum(aegyptiAgo2_norm)) %>%
  top_n(10)

## Selecting by aegyptiAgo2_norm_chimera

##11 cause of tie, randomly throw one away
# The dropped family is hard-coded; it must be revisited if the input changes.
topAgo2aegyptichim <- topAgo2aegyptichim[topAgo2aegyptichim$six_mer_target!= "CCTTCT",]
topAgo2aae_chim <- DNAStringSet(topAgo2aegyptichim$six_mer_target)
names(topAgo2aae_chim) <- topAgo2aegyptichim$aae_smallRNA_family
#writeXStringSet(topAgo2aae_chim, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_aegypti_Ago2_novel_chimera2.fa", format = "fasta")

#cells
# NOTE(review): the summary column is named "log2..." although it sums
# Aag2Ago2_norm directly; the name is kept so the rendered message still matches.
Ago2cellchim <- novel_Ago2_chim_merge %>%
  group_by(six_mer_target) %>%
  summarise(aae_smallRNA_family = paste0(smallRNA, collapse = ":"),
            log2Aag2Ago2_norm_chimera = sum(Aag2Ago2_norm)) %>%
  top_n(10)

## Selecting by log2Aag2Ago2_norm_chimera

topAgo2cellchim <- DNAStringSet(Ago2cellchim$six_mer_target)
names(topAgo2cellchim) <- Ago2cellchim$aae_smallRNA_family
# Fixed: this commented export used to write topAgo1cellchim (the Ago1 list)
# into the Ago2 chimera file.
#writeXStringSet(topAgo2cellchim, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/top10_cells_Ago2_novel_chimera.fa", format = "fasta")

##all aegypti Ago2 novel
# From here on topAgo2aegypti is a DNAStringSet (it was a summary table above).
topAgo2aegypti <- c(topAgo2aae_chim, topAgo2aaen)
topAgo2aegypti <- unique(topAgo2aegypti)
#writeXStringSet(topAgo2aegypti, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/aegypti_Ago2_novel2.fa", format = "fasta")

##Ago2 novel cells
topAgo2cells <- c(topAgo2cellchim, topAgo2cellsn)
topAgo2cells <- unique(topAgo2cells)
#writeXStringSet(topAgo2cells , "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/cells_Ago2_novel.fa", format = "fasta")

##all Ago2 together
all_Ago2n_top <- c(topAgo2cells, topAgo2aegypti)
all_Ago2n_top <- unique(all_Ago2n_top)
#writeXStringSet(all_Ago2n_top, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/all_Ago2_novel_rn.fa", format = "fasta")

allAgo2_n_k <- c(topAgo2bothk, all_Ago2n_top)
allAgo2_n_k <- unique(allAgo2_n_k)
#writeXStringSet(allAgo2_n_k, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/all_Ago2_known_novel_rn2.fa", format = "fasta")

#get known miRNA with same average abundance as novel 
# Pick known miRNA families whose Ago1 abundance is close to the mean abundance
# of the top novel families, separately for mosquito and Aag2 cells.

# Per-family (six_mer_target) summed Ago1 abundance of novel small RNAs.
TSg <- TSnovel %>%
  group_by(six_mer_target) %>%
  summarise(
    aegypti_fam_ab = sum(aegyptiAgo1_counts_norm),
    cells_fam_ab = sum(Aag2Ago1_counts_norm)
  )

# Mean abundance of the top novel aegypti families.
in_top_aegypti <- TSg$six_mer_target %in% topAgo1aaen
TSg[in_top_aegypti, ] %>% summarize(mean = mean(aegypti_fam_ab)) #1472

## Error in get(genname, envir = envir) : object 'vec_ptype2' not found
## # A tibble: 1 x 1
##    mean
##   <dbl>
## 1 1472.

# Mean abundance of the top novel Aag2 families.
in_top_cells <- TSg$six_mer_target %in% topAgo1cellsn
TSg[in_top_cells, ] %>% summarize(mean = mean(cells_fam_ab)) #717

## # A tibble: 1 x 1
##    mean
##   <dbl>
## 1  717.

# Per-family summed Ago1 abundance of known miRNAs, with the family name.
TSK_g <- TSk %>%
  group_by(six_mer_target) %>%
  summarise(
    aegypti_fam_ab = sum(aegyptiAgo1_counts_norm),
    cells_fam_ab = sum(Aag2Ago1_counts_norm),
    aae_smallRNA_family = dplyr::first(aae_smallRNA_family)
  )

# Cells: known families inside the hard-coded window (567, 867) around 717.
known_miRNAs_novelab_cells <- TSK_g %>%
  filter(cells_fam_ab > 567, cells_fam_ab < 867)
known_miRNAs_novelab_Aag2 <- DNAStringSet(known_miRNAs_novelab_cells$six_mer_target)
names(known_miRNAs_novelab_Aag2) <- known_miRNAs_novelab_cells$aae_smallRNA_family

#writeXStringSet(known_miRNAs_novelab_Aag2, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/known_miRNAs_novelab_cells2.fa", format = "fasta")

# Mosquito: known families inside the hard-coded window (1322, 1622) around 1472.
known_miRNAs_novelab_mosq <- TSK_g %>%
  filter(aegypti_fam_ab > 1322, aegypti_fam_ab < 1622)
known_miRNAs_novelab_aae <- DNAStringSet(known_miRNAs_novelab_mosq$six_mer_target)
names(known_miRNAs_novelab_aae) <- known_miRNAs_novelab_mosq$aae_smallRNA_family

#writeXStringSet(known_miRNAs_novelab_aae, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/known_miRNAs_novelab_mos2.fa", format = "fasta")

##need to name top ones
TSnovelfam <- TSnovel  %>% group_by(six_mer_target) %>% summarise(aae_smallRNA_family = paste0(smallRNA, collapse = ":"), aegyptiAgo1_counts_norm = sum(aegyptiAgo1_counts_norm), aegyptiAgo2_counts_norm = sum(aegyptiAgo2_counts_norm), Aag2Ago1_counts_norm = sum(Aag2Ago1_counts_norm), Aag2Ago2_counts_norm = sum(Aag2Ago2_counts_norm))

TSnovelfam$topAgo2c <- ifelse(TSnovelfam$six_mer_target %in% topAgo2cells & TSnovelfam$six_mer_target %in% topAgo2aegypti, paste0("common-Ago2-novel-fam"), NA)
TSnovelfam$topAgo2c <- ifelse(TSnovelfam$six_mer_target %in% topAgo2cells & !TSnovelfam$six_mer_target %in% topAgo2aegypti, paste0("Aag2-Ago2-novel-fam"), paste0(TSnovelfam$topAgo2c))
TSnovelfam$topAgo2c <- ifelse(!TSnovelfam$six_mer_target %in% topAgo2cells & TSnovelfam$six_mer_target %in% topAgo2aegypti, paste0("aegypti-Ago2-novel-fam"), paste0(TSnovelfam$topAgo2c))

TSnovelfam$topAgo1c <- ifelse(TSnovelfam$six_mer_target %in% topAgo1cells_all & TSnovelfam$six_mer_target %in% topAgo1aegypti_all, paste0("common-Ago1-novel-fam"), NA)
TSnovelfam$topAgo1c <- ifelse(TSnovelfam$six_mer_target %in% topAgo1cells_all & !TSnovelfam$six_mer_target %in% topAgo1aegypti_all, paste0("Aag2-Ago1-novel-fam"), paste0(TSnovelfam$topAgo1c))
TSnovelfam$topAgo1c <- ifelse(!TSnovelfam$six_mer_target %in% topAgo1cells_all & TSnovelfam$six_mer_target %in% topAgo1aegypti_all, paste0("aegypti-Ago1-novel-fam"), paste0(TSnovelfam$topAgo1c))

TSnovelfam <- TSnovelfam[grepl("novel", TSnovelfam$topAgo2c) | grepl("novel", TSnovelfam$topAgo1c),]

#rank by abundance to name; if novel small RNA is specific, name by appropriate sample overall abundance; if in cells and mosquitoes, rank by average abundance

TSnovelfam <- TSnovelfam %>% rowwise()  %>% mutate(avAgo1 = mean(c(aegyptiAgo1_counts_norm, Aag2Ago1_counts_norm)))

TSnovelfam <- TSnovelfam %>% rowwise()  %>% mutate(avAgo2 = mean(c(aegyptiAgo2_counts_norm, Aag2Ago2_counts_norm)))

TSnovelfam$avAgo1 <- ifelse(grepl("aegypti", TSnovelfam$topAgo1c), as.numeric(paste0(TSnovelfam$aegyptiAgo1_counts_norm)), as.numeric(paste0(TSnovelfam$avAgo1)))

TSnovelfam$avAgo1 <- ifelse(grepl("Aag2", TSnovelfam$topAgo1c), as.numeric(paste0(TSnovelfam$Aag2Ago1_counts_norm)), as.numeric(paste0(TSnovelfam$avAgo1)))

TSnovelfam$avAgo2 <- ifelse(grepl("aegypti", TSnovelfam$topAgo2c), as.numeric(paste0(TSnovelfam$aegyptiAgo2_counts_norm)), as.numeric(paste0(TSnovelfam$avAgo2)))

TSnovelfam$avAgo2 <- ifelse(grepl("Aag2", TSnovelfam$topAgo2c), as.numeric(paste0(TSnovelfam$Aag2Ago2_counts_norm)), as.numeric(paste0(TSnovelfam$avAgo2)))

TSnovelfam <-  TSnovelfam %>% group_by(topAgo1c) %>% mutate(Ago1rank = min_rank(dplyr::desc(avAgo1)))

## Warning: Grouping rowwise data frame strips rowwise nature

TSnovelfam <-  TSnovelfam %>% group_by(topAgo2c) %>% mutate(Ago2rank = min_rank(dplyr::desc(avAgo2)))

TSnovelfam$name_test <- ifelse(grepl("-", TSnovelfam$topAgo1c), paste0(TSnovelfam$topAgo1c, TSnovelfam$Ago1rank), NA)

TSnovelfam$name_test2 <- ifelse(grepl("-", TSnovelfam$topAgo2c), paste0(TSnovelfam$topAgo2c, TSnovelfam$Ago2rank), NA)

TSnovelfam$all_nams <- ifelse(!is.na(TSnovelfam$name_test) & !is.na(TSnovelfam$name_test2), paste0(TSnovelfam$name_test,  "/", TSnovelfam$name_test2), NA)
TSnovelfam$all_nams <- ifelse(is.na(TSnovelfam$name_test) & !is.na(TSnovelfam$name_test2), paste0( TSnovelfam$name_test2), TSnovelfam$all_nams)
TSnovelfam$all_nams <- ifelse(!is.na(TSnovelfam$name_test) & is.na(TSnovelfam$name_test2), paste0(TSnovelfam$name_test),TSnovelfam$all_nams)

TSnovelfam$name_test <- TSnovelfam$all_nams
TSnovelfam$all_nams <- NULL
TSnovelfam$name_test2 <- NULL

Name all other novel small RNAs

#Name everything else: small RNAs related to a known aegypti miRNA take that family's name,
#unrelated ones are grouped by family and numbered; members of a family are ordered by length.
#First attach the top-family name (name_test) to every small RNA sharing the seed target.
TS6_m <- merge(
  x = TS6_m,
  y = TSnovelfam[, c("six_mer_target", "name_test")],
  by = "six_mer_target",
  all.x = TRUE
)

#working copy with the sequence length in characters
TS6_d <- TS6_m
TS6_d[["length"]] <- nchar(as.character(TS6_d$sequence), type = "chars")

#Known small RNAs (name contains "aae") keep their own name as ID.
#Their family label is the part of aae_smallRNA_family before the first ":",
#with one trailing "a", then one trailing "b", then one trailing "c" removed.
known <- TS6_d[grepl("aae", TS6_d$smallRNA), ]
known$ID <- paste0(known$smallRNA)
known$name_test <- sub(
  "c$", "",
  sub(
    "b$", "",
    sub("a$", "", str_split_fixed(known$aae_smallRNA_family, ":", 2)[, 1])
  )
)


#Members of the ranked top families: every small RNA that received a name_test from TSnovelfam.
top <- TS6_d[!is.na(TS6_d$name_test), ]

#Number the members of each six_mer group from shortest to longest.
#arrange() sorts the whole table by length, so row_number() within a group follows length order.
top <- top %>%
  group_by(six_mer) %>%
  arrange(length) %>%
  mutate(rank = row_number()) %>%
  ungroup()

#Member suffix: a..z, then aa..zz. Looked up by rank rather than translated digit by digit,
#because chartr("123456789", "abcdefghi", rank) turns rank 10 into "a0", 20 into "b0", etc.
member_letters <- c(letters, paste0(rep(letters, each = 26), letters))
stopifnot(all(top$rank <= length(member_letters)))

#ID = family name without "fam" + member suffix
top$ID <- paste0(gsub("fam", "", top$name_test), member_letters[top$rank])
top$rank <- NULL
top <- as.data.frame(top)

#Everything not already handled as a top-family member or a known small RNA still needs a name.
toann <- TS6_d[!TS6_d$smallRNA %in% top$smallRNA, ]
toann <- toann[!toann$smallRNA %in% known$smallRNA, ]

#Novel small RNAs with no related aegypti family: one numbered family per six_mer group,
#members numbered from shortest to longest (arrange() sorts the whole table by length,
#so row_number() within a group follows length order).
toann_novel <- toann[is.na(toann$aae_smallRNA_family), ]
toann_novel <- toann_novel %>%
  group_by(six_mer) %>%
  arrange(length) %>%
  mutate(rank = row_number())

#Family number = index of the six_mer group. The table is already grouped by six_mer,
#so group_indices() is called without extra arguments.
toann_novel$rank2 <- group_indices(toann_novel)
toann_novel <- ungroup(toann_novel)

#Member suffix: a..z, then aa..zz. Looked up by rank rather than translated digit by digit,
#because chartr("123456789", "abcdefghi", rank) turns rank 10 into "a0", 20 into "b0", etc.
member_letters <- c(letters, paste0(rep(letters, each = 26), letters))
stopifnot(all(toann_novel$rank <= length(member_letters)))

toann_novel$ID <- paste0("aae-novel-", toann_novel$rank2, member_letters[toann_novel$rank])
toann_novel$name_test <- paste0("aae-novel-", toann_novel$rank2)
toann_novel$rank <- NULL
toann_novel$rank2 <- NULL
toann_novel <- as.data.frame(toann_novel)

#Remaining small RNAs that are related to a known aegypti family.
toann_rel <- toann[!toann$smallRNA %in% toann_novel$smallRNA, ]

#Family label: part of aae_smallRNA_family before the first ":", minus one trailing "a".
toann_rel$name_test <- str_split_fixed(toann_rel$aae_smallRNA_family, ":", 2)[, 1]
toann_rel$name_test <- sub("a$", "", toann_rel$name_test)

#ID stem: family label + "-novel" with the arm designation removed.
#"a-3p" has to be stripped before "-3p" so the family letter goes with it.
toann_rel$ID <- paste0(toann_rel$name_test, "-novel")
toann_rel$ID <- gsub("a-3p", "", toann_rel$ID)
toann_rel$ID <- gsub("-3p", "", toann_rel$ID)
toann_rel$ID <- gsub("-5p", "", toann_rel$ID)

#Number the members of each six_mer group from shortest to longest.
#arrange() sorts the whole table by length, so row_number() within a group follows length order.
toann_rel <- toann_rel %>%
  group_by(six_mer) %>%
  arrange(length) %>%
  mutate(rank = row_number()) %>%
  ungroup()

#Member suffix: a..z, then aa..zz. Looked up by rank rather than translated digit by digit,
#because chartr("123456789", "abcdefghi", rank) turns rank 10 into "a0", 20 into "b0", etc.
member_letters <- c(letters, paste0(rep(letters, each = 26), letters))
stopifnot(all(toann_rel$rank <= length(member_letters)))

toann_rel$ID <- paste0(toann_rel$ID, "-", member_letters[toann_rel$rank])
toann_rel$rank <- NULL
toann_rel <- as.data.frame(toann_rel)

#Stack the four annotated subsets into the final table.
TS6 <- rbind(known, toann_rel, top, toann_novel)

#name_test becomes the family label (numbered novel families get a "fam" tag) and ID becomes
#the small RNA name; the two helper columns are then removed.
TS6$aae_smallRNA_family <- gsub("aae-novel-", "aae-novel-fam", TS6$name_test)
TS6$smallRNA <- TS6$ID
TS6[c("name_test", "ID")] <- NULL
colnames(TS6)[colnames(TS6) == "length"] <- "smallRNA_length"

#reorder columns by position (relies on the table having exactly 41 columns at this point)
TS6 <- TS6[, c(2:3, 41, 4, 1, 5:40)]

#use ";" instead of ":" as separator in the related-family columns of the other species
related_cols <- c("cqu_related_miRNA_family", "aga_related_miRNA_family", "dme_related_miRNA_family")
TS6[related_cols] <- lapply(TS6[related_cols], gsub, pattern = ":", replacement = ";")

#turn the literal string "NA" into a real missing value in these two columns
TS6 <- replace_with_na_at(
  TS6,
  .vars = c("precursor_coordinate", "estimated_probability_smallRNA_is_true_positive"),
  condition = ~.x == "NA"
)

#write.table(TS6, "/Users/kathryn/Reprocess_all_paper_datasets/Supp_tables/Table_S3_Resource_paper.txt", col.names=TRUE, row.names = FALSE, sep = "\t", quote = FALSE)

#one entry per distinct filtered seed, labelled with its family name
seeds <- unique(setNames(DNAStringSet(TS6$six_mer_target), TS6$aae_smallRNA_family))
#writeXStringSet(seeds, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/all_unique_filt_seeds_rn3.fa", format = "fasta")

#every filtered small RNA sequence, labelled with its final name
allfa <- setNames(DNAStringSet(TS6$sequence), TS6$smallRNA)
#writeXStringSet(allfa, "/Users/kathryn/Reprocess_all_paper_datasets/seed_search_refseqs/FINAL/all_unique_filt_sRNAs_rn.fa", format = "fasta")

Output files “all_unique_filt_sRNAs_rn.fa” and “all_unique_filt_seeds_rn3.fa” are available on my GitHub.

Table S4, “Table_S4.txt”, is available on my Github

Rename FASTA files with final names

these are the input fastas provided on my Github

#finally, rewrite DNAstringsets renamed
#`pattern` is a regular expression, not a shell glob: escape the dot and anchor at the end of
#the name so that only files ending in ".fa" are listed (an unanchored ".fa" also matches
#sub-directories and any other entry whose name merely contains "fa").
Dir <- "/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload"
fasta <- dir(Dir, pattern = "\\.fa$", full.names = TRUE)

#' Write a family-renamed copy of a seed FASTA file
#'
#' Reads the sequences in `input`, keeps the rows of `namedf` whose
#' `six_mer_target` occurs among them, and writes the unique seeds, named by
#' `aae_smallRNA_family`, next to the input with the suffix `_fam_rn.fa`.
#'
#' @param input Path to a FASTA file of seed sequences, expected to end in ".fa".
#' @param namedf Data frame with columns `six_mer_target` and
#'   `aae_smallRNA_family`.
#' @return The value of `writeXStringSet()`; called for its side effect of
#'   writing the renamed FASTA file.
rename_fun <- function(input, namedf) {
  seqs <- readDNAStringSet(input)

  # rows of the naming table whose seed is present in this FASTA
  keep <- namedf$six_mer_target %in% as.character(seqs)
  matched <- namedf[keep, c("six_mer_target", "aae_smallRNA_family")]

  ss <- DNAStringSet(matched$six_mer_target)
  names(ss) <- matched$aae_smallRNA_family
  ss <- unique(ss)

  # Replace only the literal ".fa" extension at the end of the path. The
  # unescaped, unanchored ".fa" regex matched any character followed by "fa"
  # anywhere in the path and rewrote every occurrence.
  outname <- sub("\\.fa$", "_fam_rn.fa", input)
  if (identical(outname, input)) {
    # no ".fa" extension to replace: append instead of overwriting the input
    outname <- paste0(input, "_fam_rn.fa")
  }

  writeXStringSet(ss, outname, format = "fasta")
}

#for (i in 1:length(fasta)) {
#  rename_fun(fasta[i], TS6)
#}

#Top known small RNAs get a prefix recording whether they are "top" in cells (Aag2), mosquitoes (aegypti), or both
aaek <- readDNAStringSet("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/renamed_smallRNA_fastas/top10_aegypti_Ago1_known_fam_rn.fa")
cellk <- readDNAStringSet("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/renamed_smallRNA_fastas/top10_cells_Ago1_known_fam_rn.fa")

#one row per sequence (column "x"), with the FASTA name kept in its own column
aaekdf <- as.data.frame(aaek)
aaekdf$aae_name <- rownames(aaekdf)
cellkdf <- as.data.frame(cellk)
cellkdf$cell_name <- rownames(cellkdf)

#full outer join on the sequence: a missing name marks the set the sequence is absent from
Ago1k <- merge(aaekdf, cellkdf, by = "x", all = TRUE)
Ago1k$newname <- with(
  Ago1k,
  ifelse(
    !is.na(aae_name) & !is.na(cell_name),
    paste0("top-both-", cell_name),
    ifelse(
      !is.na(cell_name),
      paste0("top-Aag2-", cell_name),
      ifelse(!is.na(aae_name), paste0("top-aegypti-", aae_name), "NA")
    )
  )
)
bothk <- setNames(DNAStringSet(Ago1k$x), Ago1k$newname)

aaen <- readDNAStringSet("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/renamed_smallRNA_fastas/aegypti_Ago1_novel_fam_rn.fa")
celln <- readDNAStringSet("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/renamed_smallRNA_fastas/cells_Ago1_novel_fam_rn.fa")

#novel (mosquito, then cells) followed by the prefixed known seeds, one entry per distinct sequence
alltog <- unique(c(aaen, celln, bothk))

#writeXStringSet(alltog,"/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/renamed_smallRNA_fastas/all_Ago1_known_novel_fam_rn.fa")