Seed search functions

Additionally, see CLIPflexR documentation for:
fetchSequencesForCLIP
annotatePeaksWithPatterns

 fetchSequencesForClIP <- function(peaks,reSize=NULL,fasta,add5=0,add3=0){
  swd <- Rsamtools::FaFile(fasta)
  
  fastaLens <- swd %>%  seqlengths
  start(peaks) <- start(peaks)-add5
  end(peaks) <- end(peaks)+add3
  sees <- seqlevels(peaks)
  seqlengths(peaks) <- fastaLens[match(sees,names(fastaLens),incomparables = 0)]
  
  
  
  Boundaries <- GRanges(seqlevels(peaks),IRanges(1,seqlengths(peaks)))
  
  if(!is.null(reSize)){
    resizePeaks <- resize(peaks, reSize, fix="center", use.names=TRUE)
    
  }else{
    resizePeaks <- peaks
  }
  names(resizePeaks) <- paste0(seqnames(resizePeaks),":",start(resizePeaks),"-",end(resizePeaks))
  resizePeaks <- trim(resizePeaks)
  temp <- findOverlaps(resizePeaks,Boundaries,type=c("within"))
  temp2 <- Rsamtools::getSeq(Rsamtools:::FaFile(fasta),resizePeaks[temp@from])
  names(temp2) <- names(resizePeaks[temp@from])
  temp2
}



annotatePeaksWithPatterns  <- function(peaks,fasta,patterns,resize=64,add5=0,add3=0,verbose=FALSE,checkReverse=TRUE){
  require(GenomicRanges)
  require(magrittr)
  require(Biostrings)
  require(Rsamtools)
  
  if(verbose) message("Reading in peaks....",appendLF = FALSE)
  if(is(peaks,"GRanges")){
    peaks <- peaks
    peaks$originalPeak <-  paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), "_", strand(peaks))
  }else if(is(peaks,"character")){
    if(!file.exists(peaks))stop(paste0("The file ",peaks," does not exist"))
    peaks <- read.table(peaks, header = TRUE)
    peaks <- makeGRangesFromDataFrame(peaks,
                                      keep.extra.columns=TRUE,
                                      ignore.strand=FALSE,
                                      seqinfo=NULL,
                                      seqnames.field="seqnames",
                                      start.field="start",
                                      end.field="end",
                                      strand.field="strand",
                                      starts.in.df.are.0based=FALSE)
    peaks$originalPeak <-  paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), ":", strand(peaks))
    
  }
  if(verbose) message("...done")
  if(verbose) message("Read in ",length(peaks)," peaks")
  
  
  if(!file.exists(fasta))stop(paste0("The file ",fasta," does not exist"))
  if(verbose) message("Indexing FASTA...",appendLF = FALSE)
  indexFa(fasta)
  if(verbose) message("...done")
  
  
  if(verbose) message("Aligning seqlevels across peaks and FASTA...",appendLF = FALSE)
  fastaLens <- FaFile(fasta) %>% seqlengths
  sees <- seqlevels(peaks)
  seqlengths(peaks) <- fastaLens[match(sees,names(fastaLens),incomparables = 0)]
  if(verbose) message("...done")
  
  if(verbose) message("Removing invalid peaks which are outside contig boundaries...",appendLF = FALSE)
  Boundaries <- GRanges(seqlevels(peaks),IRanges(1,seqlengths(peaks)))
  resizePeaks <- resize(peaks, resize, fix="center", use.names=TRUE)
  resizePeaks <- trim(resizePeaks)
  names(resizePeaks) <- paste0(seqnames(peaks),":",start(peaks),"_",end(peaks), "_", strand(peaks)) 
  temp <- findOverlaps(resizePeaks,Boundaries,type=c("within"))
  validPeaks <- resizePeaks[temp@from]
  validPeaks$extendedPeak <- paste0(seqnames(validPeaks),":",start(validPeaks),"_",end(validPeaks), "_", strand(validPeaks))
  if(verbose) message("...done")
  if(verbose) message("Removed in ",length(validPeaks)-length(resizePeaks)," peaks")
  
  if(verbose) message("Retrieving sequence from under peaks....",appendLF = FALSE)
  myRes <- getSeq(Rsamtools:::FaFile(fasta),validPeaks)
  
  names(myRes) <- validPeaks$extendedPeak
  
  validPeaks$seqUnderExtendedPeak <- myRes
  if(verbose) message("...done")
  
  if(verbose) message("Retrieving patterns to search for....",appendLF = FALSE)
  if(length(patterns) > 1){
    pattern=patterns
  }else{
    motifSeq <- readDNAStringSet(patterns)
    pattern <- as.character(motifSeq)
  }
  if(verbose) message("...done")
  if(verbose) message("Read in ",length(pattern)," patterns")
  
  if(verbose) message("Searching for ",length(pattern)," patterns in peaks....")
  mergePeaksAndSites <- patternCallList <- list()
  for(i in 1:length(pattern)){
    if(verbose) message("Search for ",pattern[i])
    peaks_Sites <- validPeaks
    fixedInRegion <- vmatchPattern(pattern=pattern[i],subject=myRes) %>% unlist()
    
    
    startOfPmatch <- resize(fixedInRegion,width=1,fix="start") %>% unname %>% as.character
    pos <- matrix(unlist(strsplit(gsub(".*:","",names(fixedInRegion)),"_")),ncol=3,byrow =T)
    seq <- gsub(":.*","",names(fixedInRegion))
    
    sitesGRpos <- GRanges(seq[pos[,3] =="+"],
                          IRanges(as.numeric(pos[pos[,3] =="+",1])+start(fixedInRegion[pos[,3] =="+"])-1,
                                  as.numeric(pos[pos[,3] =="+",1])+end(fixedInRegion[pos[,3] =="+"])-1),startOfPmatch=startOfPmatch[pos[,3] =="+"])
    sitesGRpos$PeakID <- names(fixedInRegion[pos[,3] =="+"])
    
    sitesGRneg <- GRanges(seq[pos[,3] =="-"],
                          IRanges(as.numeric(pos[pos[,3] =="-",2])-end(fixedInRegion[pos[,3] =="-"])+1,
                                  as.numeric(pos[pos[,3] =="-",2])-start(fixedInRegion[pos[,3] =="-"])+1),startOfPmatch=startOfPmatch[pos[,3] =="-"])
    sitesGRneg$PeakID <- names(fixedInRegion[pos[,3] =="-"])
    
      sitesFA <- c(fetchSequencesForClIP(sitesGRpos,reSize=NULL,fasta),reverseComplement(fetchSequencesForClIP(sitesGRneg,reSize=NULL,fasta)))
      sitesFAExtended <- c(fetchSequencesForClIP(sitesGRpos,reSize=NULL,fasta,add5=add5,add3=add3),
                           reverseComplement(fetchSequencesForClIP(sitesGRneg,reSize=NULL,fasta,add5=add5,add3=add3))
      )
      sitesGR <- suppressWarnings(c(sitesGRpos,sitesGRneg))

    sitesGR$Seq <- sitesFA
    sitesGR$SeqExtended <- sitesFAExtended
    sitesGR$siteOfPattern <- granges(sitesGR) %>% unname %>% as.character
    sitesGR$centerOfPattern <-  resize(sitesGR,width=1,fix="center") %>% unname %>% as.character
    dfSitesGR <- as.data.frame(sitesGR)
    peaks_SitesDF <- as.data.frame(peaks_Sites)
    
    mergePeaksAndSites[[i]] <- merge(peaks_SitesDF,dfSitesGR,by.x="extendedPeak",by.y="PeakID",all=FALSE)
    patternCallList[[i]] <- mergePeaksAndSites[[i]] %>% dplyr::select(Seq,SeqExtended,startOfPmatch,centerOfPattern,siteOfPattern,originalPeak,extendedPeak)
    rm(peaks_SitesDF)
    rm(peaks_Sites)
  }
  if(verbose) message("....finished search")
  names(mergePeaksAndSites) <- names(patternCallList) <- as.character(pattern)
  return(list(mergePeaksAndSites,patternCallList))
}

Get peak ranges and control ranges

read in all peaks from matrix building and PCA filtering output; input file “Final_matrix_all_peaks_add_filtering.txt” is peak matrix with all sample and filtering information, generated in PCA_annotation_fgsea_targetvenns script, and available in my Github

all <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/Final_matrix_all_peaks_add_filtering.txt", sep = "\t", header = TRUE)

#make GR for seed search
allGR <- makeGRangesFromDataFrame(all,
                                     keep.extra.columns=TRUE,
                                     ignore.strand=FALSE,
                                     seqinfo=NULL,
                                     seqnames.field="seqnames",
                                     start.field="start",
                                     end.field="end",
                                     strand.field="strand",
                                     starts.in.df.are.0based=FALSE)

Search for all top most abundant esiRNAs in AGO2 peaks

“all_Ago2_known_novel_fam_rn.fa” file with top AGO2 small RNA 6mer target sequences was generated in smallRNA_abundance_scatterplots script and is available in my Github

“all_putative_known_sRNA_seeds2.txt” file was generated in the mirdeep2_processing_filtering script
“Table_S4.txt” file was generated in the smallRNA_abundance_scatterplots script
These files are both available on my Github

“Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa” is available for download at Vectorbase

Additionally, see CLIPflexR documentation for:
annotatePeaksWithPatterns

res_Ago2_FL <- annotatePeaksWithPatterns(peaks=allGR,
fasta="/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa", patterns="/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/all_Ago2_known_novel_fam_rn.fa",
                                  resize=406,
                                  add5=11,
                                  add3=1,
                                  verbose=TRUE)
## Loading required package: Rsamtools
## Reading in peaks.......done
## Read in 98398 peaks
## Indexing FASTA......done
## Aligning seqlevels across peaks and FASTA......done
## Removing invalid peaks which are outside contig boundaries...
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 64 out-of-bound ranges located on
##   sequences MT, NIGP01000015, NIGP01000098, NIGP01000158,
##   NIGP01000162, NIGP01000179, NIGP01000252, NIGP01000253,
##   NIGP01000530, NIGP01000588, NIGP01000599, NIGP01000607,
##   NIGP01000720, NIGP01000792, NIGP01001064, NIGP01001101,
##   NIGP01001159, NIGP01001185, NIGP01001254, NIGP01001267,
##   NIGP01001282, NIGP01001308, NIGP01001315, NIGP01001383,
##   NIGP01001658, NIGP01001771, NIGP01001804, NIGP01001829,
##   NIGP01001859, NIGP01001921, NIGP01001990, NIGP01002244,
##   NIGP01002255, and NIGP01002260. Note that ranges located on a
##   sequence whose length is unknown (NA) or on a circular sequence
##   are not considered out-of-bound (use seqlengths() and isCircular()
##   to get the lengths and circularity flags of the underlying
##   sequences). You can use trim() to trim these ranges. See
##   ?`trim,GenomicRanges-method` for more information.
## ...done
## Removed in 0 peaks
## Retrieving sequence from under peaks.......done
## Retrieving patterns to search for.......done
## Read in 25 patterns
## Searching for 25 patterns in peaks....
## Search for GCATGA
## Search for TACCGC
## Search for TGGTAA
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 1 out-of-bound range located on sequence
##   NIGP01000599. Note that ranges located on a sequence whose length
##   is unknown (NA) or on a circular sequence are not considered
##   out-of-bound (use seqlengths() and isCircular() to get the lengths
##   and circularity flags of the underlying sequences). You can use
##   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
##   more information.
## Search for AAATCC
## Search for ACCGCC
## Search for ACCGGA
## Search for ATGGTA
## Search for GTAGGA
## Search for GTGGGA
## Search for ATGGAC
## Search for CCAGGA
## Search for GACCTA
## Search for CAATCC
## Search for CCAATC
## Search for CCTCTA
## Search for CGGAAT
## Search for CTGGGA
## Search for TGGTCA
## Search for AATCCA
## Search for TCCAAA
## Search for ATCCAA
## Search for CAAATC
## Search for TGGATT
## Search for ACCCAA
## Search for ATGTCG
## ....finished search
res_Ago2_FL <-res_Ago2_FL[[1]]
res_Ago2_FL <- rbindlist(res_Ago2_FL)
res_Ago2_FL$startOfPmatch <- as.numeric(res_Ago2_FL$startOfPmatch)
res_Ago2_FL$group <- paste0(res_Ago2_FL$SeqExtended, "_", res_Ago2_FL$siteOfPattern)
res_Ago2_FL$startOfPmatch <- as.numeric(res_Ago2_FL$startOfPmatch)
res_Ago2_FL$filt <- duplicated(res_Ago2_FL$group)
res_Ago2_FL <- subset(res_Ago2_FL, res_Ago2_FL$filt == FALSE)

#now need to select perfect, one mis, or 6mer matches of top esiRNAs

seeds_df <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Mol_cell_resub/Revision_files/Supplementary_Tables/Table_S4.txt", header=TRUE, sep="\t")

seed_table <- read.delim("/Users/kathryn/Reprocess_all_paper_datasets/Supp_Figs/all_putative_known_sRNA_seeds2.txt", header = TRUE, sep ="\t") #913 unique seed combos

#link to new names
seed_table <- merge(seeds_df[,c(1:2,38)], seed_table,by.x = "smallRNA_sequence" , by.y = "FL", all.x= TRUE )

Ago2_sRNAs <- readDNAStringSet("/Users/kathryn/Reprocess_all_paper_datasets/Rmds_to_upload/To_upload/all_Ago2_known_novel_fam_rn.fa", format = "fasta")
Ago2_sRNAs_cells <- Ago2_sRNAs[grepl("common-Ago2|Aag2-Ago2|Aag2-Ago2|aae-miR", names(Ago2_sRNAs))]
Ago2_sRNAs_aegypti <- Ago2_sRNAs[grepl("common-Ago2|aegypti-Ago2|aae-miR", names(Ago2_sRNAs))]

seed_table$Ago2_top <- seed_table$six_mer_target  %in% Ago2_sRNAs
FL <- DNAStringSet(seed_table$smallRNA_sequence)
names(FL) <- seed_table$smallRNA
FL <- reverseComplement(FL)
FL_df <- as.data.frame(FL)
FL_df$smallRNA <- row.names(FL_df)
FL_df <- setNames(FL_df, c("FL_target_seq", "smallRNA"))

seed_table <- merge(seed_table, FL_df, by= "smallRNA") 
Ago2_tom <- seed_table[seed_table$Ago2_top==TRUE,]
Ago2_tom <- Ago2_tom%>% group_by(.dots=c("six_mer_target", "six_mer", "aae_smallRNA_family")) %>% summarise(smallRNAs = paste0(smallRNA, collapse = ":"),  FL_target_seqs = paste0(FL_target_seq, collapse = ":"))

res_Ago2_FL <- merge(res_Ago2_FL, Ago2_tom[,c(1,3,5)], by.x= "Seq", by.y = "six_mer_target",all.x = TRUE )

res_Ago2_FL$check <- mapply(grepl, pattern=res_Ago2_FL$SeqExtended, x=res_Ago2_FL$FL_target_seqs)
res_Ago2_FL$perfect_18mer_target <- ifelse(res_Ago2_FL$check==TRUE, paste0(res_Ago2_FL$aae_smallRNA_family), paste0("NA"))

res_Ago2_FL$check <- mapply(agrepl, pattern=res_Ago2_FL$SeqExtended, x=res_Ago2_FL$FL_target_seqs, max=1)
res_Ago2_FL$one_mis_18mer_target <- ifelse(res_Ago2_FL$check==TRUE, paste0(res_Ago2_FL$aae_smallRNA_family), paste0("NA"))

cells_Ago2_6mer <- subset(res_Ago2_FL,res_Ago2_FL$Aag2_Ago2_BCsample > 0 & res_Ago2_FL$Aag2_Ago1_BCsample ==0)

cells_Ago2perf <- cells_Ago2_6mer[grepl("aae|common-AGO2|Aag2-AGO2", cells_Ago2_6mer$perfect_18mer_target),]
cells_Ago2perf_graph <- cells_Ago2perf$startOfPmatch

cells_Ago2one_mis <- cells_Ago2_6mer[grepl("aae|common-AGO2|Aag2-AGO2", cells_Ago2_6mer$one_mis_18mer_target),]
cells_Ago2one_mis_graph <- subset(cells_Ago2one_mis$startOfPmatch, cells_Ago2one_mis$perfect_18mer_target=="NA")


cells_Ago2_6mer_graph <- subset(cells_Ago2_6mer$startOfPmatch, cells_Ago2_6mer$one_mis_18mer_target=="NA")

aegypti_Ago2_6mer <- subset(res_Ago2_FL,res_Ago2_FL$aegypti_Ago2_BCsample > 0 & res_Ago2_FL$aegypti_Ago1_BCsample ==0)


aegypti_Ago2perf <- aegypti_Ago2_6mer[grepl("aae|common-AGO2|aegypti-AGO2", aegypti_Ago2_6mer$perfect_18mer_target),]
aegypti_Ago2perf_graph <- aegypti_Ago2perf$startOfPmatch

aegypti_Ago2one_mis <- aegypti_Ago2_6mer[grepl("aae|common-AGO2|aegypti-AGO2", aegypti_Ago2_6mer$one_mis_18mer_target),]
aegypti_Ago2one_mis_graph <- subset(aegypti_Ago2one_mis$startOfPmatch, aegypti_Ago2one_mis$perfect_18mer_target=="NA")


aegypti_Ago2_6mer_graph <- subset(aegypti_Ago2_6mer$startOfPmatch, aegypti_Ago2_6mer$one_mis_18mer_target=="NA")

Grab all numeric vectors containing the pattern match positions by target type to graph; Figures 6F and S6A

Ago2_perf <- grep("_graph", names(.GlobalEnv),value=TRUE)
Ago2_perf <- grep("BCsample", Ago2_perf ,value=TRUE, invert = TRUE)

Ago2_perf   <- do.call("list",mget(Ago2_perf  ))
Ago2_perf   <- lapply(Ago2_perf  , as.data.frame)
Ago2_perf   <- lapply(Ago2_perf  , setNames, nm="position")
Ago2_perf_counts <- lapply(Ago2_perf  , function(x) x %>% group_by(position) %>% summarize(count=n()))
Ago2_perf_counts <- lapply(Ago2_perf_counts, function(x) subset(x, x$position <= 320 & x$position >=80))

Ago2_perf <- rbindlist(Ago2_perf_counts, idcol=TRUE)
Ago2_perf <- setNames(Ago2_perf, c("sample", "position", "count"))
Ago2_perf  <- Ago2_perf %>% group_by(sample) %>% mutate(freq = count/(sum(count)))

        
aegyptiAgo2_perf <- Ago2_perf[grepl( "aegypti", Ago2_perf$sample),]
cellsAgo2_perf <- Ago2_perf[grepl( "cells", Ago2_perf$sample),]


ggplot(data= subset(aegyptiAgo2_perf, aegyptiAgo2_perf$sample=="aegypti_Ago2perf_graph" | aegyptiAgo2_perf$sample=="aegypti_Ago2_6mer_graph" | aegyptiAgo2_perf$sample=="aegypti_Ago2one_mis_graph"), aes(x=position, y=freq, color=sample)) +
    geom_smooth(method = lm, formula = y ~ splines::bs(x, 20), se = FALSE)  + xlim(80,320) + theme_bw()

ggplot(data= subset(cellsAgo2_perf, cellsAgo2_perf$sample=="cells_Ago2perf_graph" | cellsAgo2_perf$sample=="cells_Ago2_6mer_graph" | cellsAgo2_perf$sample=="cells_Ago2one_mis_graph"), aes(x=position, y=freq, color=sample)) +
    geom_smooth(method = lm, formula = y ~ splines::bs(x, 20), se = FALSE)  + xlim(80,320) + theme_bw()

Count number of perfect, 0-mismatch 18mer targets from the same family by peak; Figure S6F

cells_hist <- cells_Ago2perf %>% group_by(.dots=c("extendedPeak","perfect_18mer_target")) %>% summarise(count = n()) %>% mutate(sample = c("Aag2"))
mosq_hist <- aegypti_Ago2perf %>% group_by(.dots=c("extendedPeak","perfect_18mer_target")) %>% summarise(count = n()) %>% mutate(sample = c("aegypti"))
perf_hist <- rbind(cells_hist, mosq_hist) #this gives the # of perfect matches found in each peak for each smallRNA family
perf_hist2 <- perf_hist %>% group_by(.dots=c("count", "sample")) %>% mutate(peakcount = n())

unique_hist <- distinct(perf_hist2, peakcount, count, sample, .keep_all = TRUE)

unique_hist_perc <- unique_hist %>% group_by(sample) %>% mutate(percent_of_peaks = peakcount/(sum(peakcount))) 

ggplot(unique_hist_perc, aes(x=count, y = percent_of_peaks,color = sample)) +
  geom_bar(stat = "identity",position = position_dodge2(preserve = "single"), width=0.5, fill ="white") + scale_colour_manual(values = c("#FA0F0C", "#8A0F09")) + theme_bw()

#make bins for figure to reduce information
unique_hist_perc$count_bin <- ifelse(unique_hist_perc$count==1, paste0("1"), NA)
unique_hist_perc$count_bin <- ifelse(unique_hist_perc$count > 1 & unique_hist_perc$count <= 10, paste0("2-10"), paste0(unique_hist_perc$count_bin ))
unique_hist_perc$count_bin <- ifelse(unique_hist_perc$count > 10 & unique_hist_perc$count <= 20, paste0("11-20"), paste0(unique_hist_perc$count_bin ))
unique_hist_perc$count_bin <- ifelse(unique_hist_perc$count > 20 & unique_hist_perc$count <= 30, paste0("21-30"), paste0(unique_hist_perc$count_bin ))
unique_hist_perc$count_bin <- ifelse(unique_hist_perc$count > 30 & unique_hist_perc$count <= 40, paste0("31-40"), paste0(unique_hist_perc$count_bin ))
unique_hist_perc$count_bin <- ifelse(unique_hist_perc$count > 40 & unique_hist_perc$count <= 50, paste0("41-50"), paste0(unique_hist_perc$count_bin ))
unique_hist_perc$count_bin <- ifelse(unique_hist_perc$count > 50 & unique_hist_perc$count <=60, paste0("51-60"), paste0(unique_hist_perc$count_bin ))
unique_hist_perc$count_bin <- ifelse(unique_hist_perc$count > 60 & unique_hist_perc$count <=70, paste0("61-70"), paste0(unique_hist_perc$count_bin ))

unique_hist_percg <- unique_hist_perc %>% group_by(.dots=c("count_bin", "sample")) %>% summarise(percent=sum(percent_of_peaks))

ggplot(unique_hist_percg, aes(x=count_bin, y = percent,fill = sample)) +
  geom_bar(stat = "identity",position = position_dodge2(preserve = "single"), width=0.8) + scale_fill_manual(values = c("#8A0F09", "#FA0F0C")) + theme_bw()

Get seqeunces for motif analysis (Figures 6G and S6G)

Input position weight matrix files were generated using DREME (“PWM1.txt” and “PWM2.txt”) are both available on my Github

#as fasta 
fasta <- "/Users/kathryn/mirdeep2_master/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5_fixed.fa"
genome <- Rsamtools::getSeq(Rsamtools:::FaFile(fasta))

allperfect <- rbind(cells_Ago2perf, aegypti_Ago2perf)

perfGR <-  GRanges(seqnames = allperfect$seqnames.x, strand = allperfect$strand.x,
                  ranges = IRanges(start = allperfect$start.x, end = allperfect$end.x))
perfGR <- unique(perfGR)

#get center of peaks versus flanking regions  
perf_center <- resize(perfGR, width = 40, fix = "center")
flanks_center_ups <- flank(perf_center, width= 40, start = TRUE)
flanks_center_dns <- flank(perf_center, width= 40, start = FALSE)
flanks_center <- c(flanks_center_ups, flanks_center_dns)

center <- genome[perf_center]
names(center) <- paste0(perf_center@seqnames, ":", perf_center@ranges, ":", perf_center@strand)
flank <- genome[flanks_center]
names(flank) <- paste0(flanks_center@seqnames, ":", flanks_center@ranges, ":", flanks_center@strand) 

#writeXStringSet(center, "/Users/kathryn/Reprocess_all_paper_datasets/specificity_motifs/center_mod_filt.fa")
#writeXStringSet(flank, "/Users/kathryn/Reprocess_all_paper_datasets/specificity_motifs/flanks_mod_filt.fa")

#ran MEME on center versus flanking sequences and got position weight matrix:

PWM1 <- read.table("/Users/kathryn/Reprocess_all_paper_datasets/specificity_motifs/PWM1.txt")
names(PWM1) <- c("A", "C", "G", "U")
PWM1 <- t(as.matrix(PWM1))
ggseqlogo( PWM1)

PWM2 <- read.table("/Users/kathryn/Reprocess_all_paper_datasets/specificity_motifs/PWM2.txt")
names(PWM2) <- c("A", "C", "G", "U")
PWM2 <- t(as.matrix(PWM2))
ggseqlogo( PWM2)