Introduction

The following analyses were used in Xu et al, eLife, 2023 (in press). They make use of CATaDa (i.e. Dam-only binding data) and Bsh TaDa binding data, combined with existing scRNA-seq data, to discover gene expression and motif enrichment associated with Bsh in L4 neurons.

Initial NGS data processing

NGS data were processed using damidseq_pipeline v1.5.3, and Dam-only tracks saved with the --coverage option.

Peaks were processed using find_peaks v1.0.3. For CaTaDa, parameters were --min_quant=70. For Bsh, default parameters were used. All replicates were processed separately.

Load libraries and define functions

# We're using pacman to make life easier ...
if (!require("pacman")) install.packages("pacman"); library(pacman,quietly = T)

## Loading required package: pacman

# Now we load the rest:
p_load(
  "tools",
  "parallel",
  "ggplot2",
  "stringr",
  "dplyr",
  "tidyr",
  "NOISeq",
  "GenomicRanges",
  "BSgenome",
  "preprocessCore",
  "BSgenome.Dmelanogaster.UCSC.dm6",
  "BioVenn"
)

theme_set(theme_bw())

curr.date = Sys.Date()
cutoff=0.85

read.gff = function (x,name="score") {
  fn.ext = file_ext(x)
  
  if (grepl("gff$",ignore.case=T,fn.ext)) {
    temp.data <- read.table(x,row.names=NULL)
    if (ncol(temp.data) > 5) {
      # GFF
      trim.data = temp.data[,c(1,4,5,6)]
    } else {
        cat("Error: file does not appear to be in GFF format\n\n")
        quit("no",1)
    }
  } else if (grepl("bedgraph$",ignore.case=T,fn.ext)) {
    temp.data = read.table(x,row.names=NULL,skip=1)
    if (ncol(temp.data) == 4) {
        # bedgraph
        trim.data = temp.data
    } else {
        cat("Error: file does not appear to be in bedGraph format\n\n")
        quit("no",1)
    }
  } else {
    cat("Error: input file does not appear to be in bedGraph or GFF format ...\n\n")
    quit("no",1)
  }
  
  names(trim.data) = c("chr","start","end",name)
  trim.data$chr = gsub("^chr","",trim.data$chr,perl=T)
  
  return(trim.data)
}

build.dataframes = function (bedgraphs) {
  cat("Building dataframes:\n")
  
  data.all = NULL
  for (i in 1:length(bedgraphs)) {
    file = bedgraphs[i]
    prot = regmatches(file,regexpr("(?<=/)(?!.*/).*?(?=\\.(gatc|average))",file,perl=T))
    cat("  loading",prot,"...\n")
    tempin = read.gff(file,tolower(prot))
    data.all = if (is.null(data.all)) tempin else merge(data.all,tempin,by=c("chr","start","end"))
  }
  
  # order by chr and fragment
  data.all = data.all[order(data.all$chr,data.all$start),]
  return(data.all)
}

find.overlap.sites = function (query, subject, maxgap=0) {
  subject[ 
    as.data.frame(findOverlaps(query, subject, maxgap = maxgap)) %>%
      dplyr::select(subjectHits) %>% 
      unlist(use.names = F)
    ]
}

all.overlaps.to.original = function (query, subject, maxgap=0) {
  ol = findOverlaps(query, subject, maxgap = maxgap)
  testl=list(); 
  null = lapply(
    queryHits(ol),
    FUN = function(x)testl[[x]]<<-""
    )
  null = apply(
    ol%>%as.data.frame,1,function(x){
      qh = as.numeric(unname(x[1])); sh = as.numeric(unname(x[2])); shg = genes$name[sh]; testl[[qh]] <<- if (testl[[qh]][1]=="") shg else c(testl[[qh]],shg) 
    }
  )
  outl = lapply(
    testl,
    function (x) {y = paste(sort(x),collapse=",")}
  )
  
  query$matches = outl %>% unlist
  #return(query)
  return(outl %>% unlist)
}

region.to.coords = function (x) {
  (regmatches(x,regexec("(.*?):(\\d+)-(\\d+)",x,perl=T)) %>% unlist())[2:4]
}

regions.to.gr = function (x) {
  GRanges(regions.to.df(x))
}

gr.occupancy = function (input.df, gr, buffer=0) {
  total = length(gr)
  gdf = as.data.frame(gr)
  names(gdf)[1]='chr'
  gdf$name = apply(gdf,1,function (x) sprintf("%s:%s-%s",x[1],as.numeric(x[2]),as.numeric(x[3])))
  
  genes = gdf[,c('chr','start','end','strand','name')]
  
  avg.exp = data.frame(input.df[1,c(4:(length(names(input.df))))])
  avg = vector(length=(length(names(input.df)) - 4))
  avg.exp = avg.exp[0,]
  
  ### Gene expression values ###  
  cat("Calculating gene values ...")
  
  count = 0
  
  # unroll chromosomes for speed:
  mc.out = mclapply(unique(genes$chr), mc.cores = global.mc.cores, function (chromo) {
    input.chr = subset(input.df, chr==chromo)
    genes.chr = subset(genes, chr==chromo)
    for (i in 1:length(genes.chr$name)) {
      # Roll through each gene
      
      gene.start = genes.chr[i,"start"] - buffer
      gene.end = genes.chr[i,"end"] + buffer
      
      gene.start = ifelse(gene.start < 1, 1, gene.start)
      
      # Create data frames for all gatc fragments covering current gene
      exp = data.frame(input.chr[ (input.chr$start <= gene.end) 
                                  & (input.chr$end >= gene.start) 
                                  ,] )
      
      gatc.num = length(exp[,1])
      
      # skip if no gatc fragments cover gene :(
      if (gatc.num == 0) {next}
      
      # trim to gene boundaries ...
      exp$start[1] = gene.start
      exp$end[length(exp[,1])] = gene.end
      
      # gene length covered by gatc fragments
      len = sum(exp$end-exp$start)
      
      # calculate weighted score for each column (representing different proteins)
      for (j in 4:length(names(input.chr))) {
        avg[j] = (sum((exp$end-exp$start)*exp[j]))/len
      }
      
      # make data.frame of averages (to be appended to avg.exp)
      df = cbind(avg[1])
      for (k in 2:length(avg)) {
        df = cbind(df,avg[k])
      }
      df = cbind(df,gatc.num)
      
      
      # append current gene to list
      avg.exp = rbind(avg.exp,data.frame(name=as.character(genes.chr[i,"name"]), df))
      count = count+1
      if (count %% 200 == 0) {cat(".")}
    }
    return(avg.exp)
  })
  
  cat("\n")
  
  for (i in 1:length(mc.out)) {
    avg.exp = rbind(avg.exp, mc.out[[i]])
  }
  
  avg.exp = avg.exp[,c(1,5:(length(names(avg.exp))))]
  names(avg.exp) = c("name",names(input.df)[c(4:(length(names(input.df))))],"gatc.num")
  #avg.exp = avg.exp[order(-avg.exp$rpii18),]
  
  return(avg.exp)
}

### Read in flybase genename lookup table:
load("data/fb.syn.d_2023_01.Rdata")

fb2gn2 = function (f) {
  out = vector()
  for (n in f) {
    x = unlist(gn.syn.d[n],use.names = F)
    o = if (is.null(x)) "unknown" else x
    out = c(out,o)
  }
  return(out)
}

# Gene definitions and GRanges object
p_load("TxDb.Dmelanogaster.UCSC.dm6.ensGene")
txdb = TxDb.Dmelanogaster.UCSC.dm6.ensGene
genes = genes(txdb)

##   1 gene was dropped because it has exons located on both strands of the
##   same reference sequence or on more than one reference sequence, so
##   cannot be represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a
##   GRangesList object, or use suppressMessages() to suppress this message.

seqlevels(genes)= levels(gsub("^chr","",seqnames(genes))) # hate levels so bad!
seqnames(genes) = gsub("^chr","",seqnames(genes))
genes$name = fb2gn2(genes$gene_id)


read.genes.gff = function (file) {
  # reads GFF, returns GRanges object
  if (grepl(".gz$",file)) {
    fc = gzfile(file,'rt')
  } else {
    fc = file
  }
  genes = read.table(fc, quote="\"",sep="\t",col.names = c("chr","source","feature","start","end","score","strand","phase","attr"))
  genes$name = genes$attr %>% str_extract("(?<=Name=).*?(?=;)")
  #genes = genes %>% filter(feature=="gene")
  gr = GRanges(genes)
  return(gr)
}

regions.to.df = function (x) {
  y = vapply(x, function (x) region.to.coords(x), FUN.VALUE = c(chr = "2L", start = 0, end = 0)) %>% t %>% as.data.frame()
  y$start = as.numeric(y$start)
  y$end = as.numeric(y$end)
  y$chr = sapply(y$chr,FUN = function (x) sprintf("chr%s",x),USE.NAMES = F)
  return(y)
}

reduce.regions = function (peaks) {
  pl = NULL
  for (f in peaks) {
    pt = read.genes.gff(f)
    pl = if (is.null(pl)) pt else c(pl,pt)
  }
  pr = reduce(pl)
  pr$id = apply(as.data.frame(pr),1,function (x) sprintf("%s:%s-%s",x[1],as.numeric(x[2]),as.numeric(x[3])))
  return(pr)
}

noiseq.fisher = function (out.volcano, matches, test.genes) {
  mynoiseq.all = out.volcano$noiseq
  red_points = rownames(mynoiseq.all)[rownames(mynoiseq.all) %in% rownames(mynoiseq.all)[grep(sprintf("(^|,)(%s)(,|$)",test),matches,perl = T,ignore.case = T)]]
  lrp = length(red_points)
  
  upreg = out.volcano$upreg
  downreg = out.volcano$downreg
  
  ldr = sum(red_points %in% downreg)
  lur = sum(red_points %in% upreg)
  
  fishertab = data.frame(row.names = c("L4exp","L4notexp"), up=c(lur,length(upreg)-lur), down=c(ldr,length(downreg)-ldr))
  
  print(fishertab)
  
  fisher.test(fishertab)
}

sci.num.l = function (x,y=NULL) {
  x = as.numeric(x)
  v = if (x<0.01) sprintf("%0.1e",x) else sprintf("%0.2f",x)
  ret = sprintf("italic(p)==%s",v)
  if (!(is.null(y))) {
    y = sprintf("%0.2f",as.numeric(y))
    ret = sprintf("%s~';'~odds==%s",ret,y)
  }
  cat(ret)
  return(ret)
}

addchr = function (x) {
  seqlevels(x) = gsub("^(.*)$", "chr\\1", seqlevels(x))
  return(x)
}


remchr = function (x) {
  seqlevels(x) = gsub("^chr", "", seqlevels(x))
  return(x)
}

plot_volcano = function (mydata,
                         cond=c("kdm5_140","wt"),
                         output.pdf = F,
                         highlight.genes=NULL,
                         highlight.genes2=NULL,
                         highlight.loci=NULL,
                         highlight.special.loci.up=NULL,
                         highlight.special.loci.down=NULL,
                         label.genes=NULL,
                         mynoiseq =  noiseq(mydata,conditions=cond,factor="Tissue",norm="uqua",replicates = "biological"),
                         plot.title = "Differentially expressed genes",
                         x.lab = paste("log(fold change (",cond[1],"/",cond[2],")",sep=""),
                         y.lab =bquote("-log(P)"),
                         theme.size = 18,
                         text.repel.size = 6,
                         label.points=T,
                         label.all=F,
                         custom.labels=NULL,
                         plot.subtitle=NULL,
                         max.overlaps=25,
                         plotname="",
                         plotdesc=plot.title,
                         sig.col = "orange",
                         highlight.col = "red",
                         highlight.col2="springgreen",
                         highlight.loci.col = "cyan",
                         highlight.special.loci.up.col="red",
                         highlight.special.loci.down.col="springgreen",
                         psize=1,
                         spsize=1,
                         lpsize=1,
                         hpsize=1,
                         palpha=0.1,
                         spalpha=0.2,
                         hpalpha=0.5,
                         lpalpha=1,
                         xlim=NULL,
                         ylim=NULL,
                         input=NULL,
                         goterms=F,
                         pdfw=2.7,
                         pdfh=3,
                         ft.alt="two.sided",
                         test.genes=NULL,
                         test.loci=NULL,
                         test.special.loci=NULL
                         ) {
  # grab the full dataset
  
  mynoiseq.all = mynoiseq@results[[1]]
  mynoiseq.all$log.rat = log(mynoiseq.all[[paste(gsub(pattern = "-",replacement = ".",x = cond[1]),"mean",sep="_")]]/mynoiseq.all[[paste(gsub(pattern = "-",replacement = ".",x = cond[2]),"mean",sep="_")]],base = 2)
  mynoiseq.all$log.prob = -log(1-mynoiseq.all$prob)
  
  # get differentially expressed genes
  mydeg = degenes(mynoiseq,q=cutoff,M=NULL)
  mydeg.up = degenes(mynoiseq,q=cutoff,M="up")
  mydeg.down = degenes(mynoiseq,q=cutoff,M="down")
  mydeg099 = degenes(mynoiseq,q=cutoff,M=NULL)
  
  # calculate means
  input[[cond[2]]] = log(rowMeans(input[,grep(cond[2],names(input))]))
  input[[cond[1]]] = log(rowMeans(input[,grep(cond[1],names(input))]))
  
  # volcano plot
  p = ggplot(mynoiseq.all,aes(log.rat,log.prob))+
    theme_bw(base_size=theme.size)+
    geom_point(color=rgb(0.4,0.4,0.4),alpha=palpha,size=psize,shape=16)+
    geom_point(data=subset(mynoiseq.all,rownames(mynoiseq.all) %in% rownames(mydeg))[,c("log.rat","log.prob")],
               color=sig.col,alpha=spalpha,size=psize,shape=16)+
    #geom_point(data=subset(mynoiseq.all,rownames(mynoiseq.all) %in% rownames(mydeg.up))[,c("log.rat","log.prob")],
               #color="orange",alpha=0.1,size=2)+
    
    #xlim(-2,3)+
    #ylim(0,7)+
    xlab(x.lab)+
    ylab(y.lab)+
    ggtitle(plot.title,plot.subtitle)
  
  if (!(is.null(xlim))) {
    p = p+
      xlim(xlim[1],xlim[2])
  }
  
  if (!(is.null(ylim))) {
    p = p+
      ylim(ylim[1],ylim[2])
  }
  
  # highlight loci
  if (!(is.null(highlight.loci))) {
    p=p+geom_point(data=subset(mynoiseq.all,rownames(mynoiseq.all) %in% highlight.loci)[,c("log.rat","log.prob")],color=highlight.loci.col,alpha=hpalpha,size=lpsize,shape=16)
  }
  
   # highlight genes
  if (!(is.null(highlight.genes))) {
    
    # special override:
    if (!(is.null(highlight.special.loci.up))) {
      p=p+geom_point(data=subset(mynoiseq.all,rownames(mynoiseq.all) %in% highlight.special.loci.up)[,c("log.rat","log.prob")],color=highlight.special.loci.up.col,alpha=lpalpha,size=hpsize,shape=16)
      if (!is.null(highlight.special.loci.down)) p=p+geom_point(data=subset(mynoiseq.all,rownames(mynoiseq.all) %in% highlight.special.loci.down)[,c("log.rat","log.prob")],color=highlight.special.loci.down.col,alpha=lpalpha,size=hpsize,shape=16)
      points.to.highlight=c(highlight.special.loci.up,highlight.special.loci.down)
    } else {
      points.to.highlight = rownames(mynoiseq.all)[rownames(mynoiseq.all) %in% rownames(mynoiseq.all)[grep(sprintf("(^|,)(%s)(,|$)",highlight.genes),custom.labels,perl = T,ignore.case = T)]]
        
    
    p=p+geom_point(data=subset(mynoiseq.all,rownames(mynoiseq.all) %in% points.to.highlight)[,c("log.rat","log.prob")],color=highlight.col,alpha=hpalpha,size=hpsize,shape=16)
    }
  }

  # second highlight group?
  if (!(is.null(highlight.genes2))) {
      points.to.highlight2 = rownames(mynoiseq.all)[rownames(mynoiseq.all) %in% rownames(mynoiseq.all)[grep(sprintf("(^|,)(%s)(,|$)",highlight.genes2),custom.labels,perl = T,ignore.case = T)]]
        
    
    p=p+geom_point(data=subset(mynoiseq.all,rownames(mynoiseq.all) %in% points.to.highlight2)[,c("log.rat","log.prob")],color=highlight.col2,alpha=hpalpha,size=spsize,shape=16)
  }
  
  if (label.points & !(is.null(custom.labels))) {
    points.to.label = if (!is.null(label.genes)) {
        if (label.all) {
          rownames(mynoiseq.all)[rownames(mynoiseq.all) %in% rownames(mynoiseq.all)[grep(sprintf("(^|,)(%s)(,|$)",label.genes),custom.labels,perl = T,ignore.case = T)]]
        } else {
          rownames(mydeg099)[rownames(mydeg099) %in% rownames(mynoiseq.all)[grep(sprintf("(^|,)(%s)(,|$)",label.genes),custom.labels,perl = T,ignore.case = T)]]
        }
      } else rownames(mydeg099)
    p = p + geom_text_repel(data=subset(mynoiseq.all,rownames(mynoiseq.all) %in% points.to.label)[,c("log.rat","log.prob")],
                   aes(label=custom.labels[rownames(mynoiseq.all) %in% points.to.label]),size=text.repel.size,
                   box.padding=0.3,
                   point.padding = 0.3,
                   label.padding=0.3,
                   min.segment.length = 0.1,
                   max.overlaps = max.overlaps
    )
    
    if (!is.null(label.genes)) {
      p = p + geom_point(data=subset(mynoiseq.all,rownames(mynoiseq.all) %in% points.to.label)[,c("log.rat","log.prob")],
                   color=highlight.col,alpha=hpalpha,size=1,,shape=16)
    }
  }
  
  upreg = rownames(degenes(mynoiseq,q=cutoff,M="up"))
  downreg = rownames(degenes(mynoiseq,q=cutoff,M="down"))
  
  ft = NULL
  fishertab = NULL
  red_points = NULL
  
  if (!(is.null(ft.alt))) {
    if (!is.null(test.special.loci)) {
      red_points=highlight.special.loci.up
    } else if (is.null(test.genes)) {
      red_points=highlight.loci
    } else {
      # red_points = rownames(mynoiseq.all)[rownames(mynoiseq.all) %in% rownames(mynoiseq.all)[grep(sprintf("(^|,)(%s)(,|$)",test.genes),input$matches,perl = T,ignore.case = T)]]
      
      red_points=points.to.highlight
      label.genes = input[input$name %in% red_points,"matches"]
    }
    
    if (!is.null(test.loci) & !is.null(test.genes)) {
      tup = highlight.loci[highlight.loci %in% upreg]
      tdown =  highlight.loci[highlight.loci %in% downreg]
    } else {
      tup = upreg
      tdown = downreg
    }
      
    lrp = length(red_points)
    
    if (!is.null(highlight.loci)) {
      overlap_points = red_points[red_points %in% highlight.loci]
      ldr = sum(overlap_points %in% tdown)
      lur = sum(overlap_points %in% tup)
      fishertab = data.frame(row.names = c("Bound","Unbound"), up=c(lur,length(tup)-lur), down=c(ldr,length(tdown)-ldr))
      
    } else {
  
      ldr = sum(red_points %in% tdown)
      lur = sum(red_points %in% tup)
      fishertab = data.frame(row.names = c("Bound","Unbound"), up=c(lur,length(tup)-lur), down=c(ldr,length(tdown)-ldr))
    }
      
    print(fishertab)
    
    ft = fisher.test(fishertab,alternative=ft.alt)
    p = p+annotate(geom = "text",label=sci.num.l(ft$p.value,ft$estimate),x=max(mynoiseq.all$log.rat)-3,y=max(mynoiseq.all$log.prob[!(is.infinite(mynoiseq.all$log.prob))])-4,parse=T)
  }
  
  if (output.pdf) pdf(sprintf("%s %s %s vs %s %s.pdf",plotname,plotdesc,cond[1],cond[2],curr.date),width=pdfw,height=pdfh)
  print(p)
  if (output.pdf) dev.off()
  
  if (output.pdf) {
    svg(sprintf("%s %s %s vs %s %s.svg",plotname,plotdesc,cond[1],cond[2],curr.date),width=pdfw,height=pdfh)
    print(p)
    dev.off()
  }
  
  return(list(upreg=upreg,downreg=downreg,noiseq=mynoiseq.all,plot=p,ft=ft,fishertab=fishertab,red_points=red_points))
}

region.venn = function (all,noiseq.out,filename) {
  shared= all$id[!(all$id %in% c(noiseq.out$upreg,noiseq.out$downreg))]
  catada_venn = draw.venn(c(noiseq.out$upreg,shared),c(noiseq.out$downreg,shared), list_z = NULL,xtitle = "L4",ytitle = "L5",title = "",subtitle = "",output = "pdf",filename = filename)
}

str.to.pwm = function (x,x1000=F) {
  z = str_split_1(x,"\n")
  pwm = vapply(z,function(x) {
    y = (regmatches(x,regexec("\\d\\s+([\\d\\.]+)\\s+([\\d\\.]+)\\s+([\\d\\.]+)\\s+([\\d\\.]+)",text = x,perl = T)) %>% unlist())[2:5] %>% as.numeric()
    if (x1000) y = y * 1000
    print(y)
    print(sum(y))
    y=y
  }, FUN.VALUE = c(A=0,C=0,G=0,T=0)) %>% t
  
  mode(pwm)="integer"
  return(PWM(as.matrix(t(pwm)),type="prob"))
}

pwm.find = function (pwm, regions, refgenome=Dmelanogaster, score.cut="95%", max.mismatch=0, fixed.width=1000, tada.motif.fix=NULL, max.fix.width=T) {
  regions.gr = regions.to.gr(regions)
  if (!(is.null(fixed.width))) {
    if (max.fix.width) {
      for (i in 1:length(regions.gr)) {if (width(regions.gr[i,])>fixed.width) regions.gr[i,] = resize(regions.gr[i,],width=fixed.width,fix="center")}
    } else {
      regions.gr = resize(regions.gr,width=fixed.width,fix="center")
    }
  }
  seqs = getSeq(refgenome,regions.gr)
  seqs.pwm = sapply(seqs, function (x) {matchPWM(pwm,x,min.score = score.cut)}) %>% summary() %>% as.data.frame() %>% dplyr::rename(Freq.f = Freq) %>% filter(Var2=="Length")
  seqs.pwm$width = width(seqs)
  
  seqs.pwm.r = sapply(seqs, function (x) {matchPWM(reverseComplement(pwm),x,min.score = score.cut)}) %>% summary() %>% as.data.frame() %>% dplyr::rename(Freq.r = Freq) %>% filter(Var2=="Length")
  seqs.pwm.r$width = width(seqs)
  seqs.pwm.all = merge(seqs.pwm,seqs.pwm.r, by=c("Var1","Var2","width"))
  
  seqs.pwm.all$Freq.f = as.numeric(seqs.pwm.all$Freq.f)
  seqs.pwm.all$Freq.r = as.numeric(seqs.pwm.all$Freq.r)
  
  seqs.pwm.all$Freq = apply(as.matrix(seqs.pwm.all[,c("Freq.f","Freq.r")]),1,function(x) max(x))
  return(seqs.pwm.all)
}

Load up the data

files = c(Sys.glob("data/*ph"),Sys.glob("data/catada/*ph"))
data = build.dataframes(files)

## Building dataframes:
##   loading Bsh_Dam_L4_r1-ext300-vs-Dam.kde-norm ...
##   loading Bsh_Dam_L4_r2-ext300-vs-Dam.kde-norm ...
##   loading Bsh_Dam_L4_r3-ext300-vs-Dam.kde-norm ...
##   loading Bsh_Dam_L5_r1-ext300-vs-Dam.kde-norm ...
##   loading Bsh_Dam_L5_r2-ext300-vs-Dam.kde-norm ...
##   loading Bsh_Dam_L5_r3-ext300-vs-Dam.kde-norm ...
##   loading Dam_only_L4_r1-ext300 ...
##   loading Dam_only_L4_r2-ext300 ...
##   loading Dam_only_L4_r3-ext300 ...
##   loading Dam_only_L5_r1-ext300 ...
##   loading Dam_only_L5_r2-ext300 ...
##   loading Dam_only_L5_r3-ext300 ...

# quantile normalised dataset
bsh_qnorm = as.data.frame(normalize.quantiles(as.matrix(data[,grep("bsh_dam_l(4|5)_r",names(data),perl = T)])))
names(bsh_qnorm) = paste(names(data)[grep("bsh_dam_l(4|5)_r",names(data),perl = T)],"_qnorm",sep="")

# replicate means
bsh_qnorm$bsh_l4_qnorm_avg = apply(bsh_qnorm[,grep("bsh_dam_l4",names(bsh_qnorm),perl = T)],1,function (x) {mean(x)})
bsh_qnorm$bsh_l5_qnorm_avg = apply(bsh_qnorm[,grep("bsh_dam_l5",names(bsh_qnorm),perl = T)],1,function (x) {mean(x)})

data$bsh_l4_avg = apply(data[,grep("bsh_dam_l4",names(data),perl = T)],1,function (x) {mean(x)})
data$bsh_l5_avg = apply(data[,grep("bsh_dam_l5",names(data),perl = T)],1,function (x) {mean(x)})

data$catada_l4_avg = apply(data[,grep("dam_only_l4",names(data),perl = T)],1,function (x) {mean(x)})
data$catada_l5_avg = apply(data[,grep("dam_only_l5",names(data),perl = T)],1,function (x) {mean(x)})

# merged dataset
datam = cbind(data,bsh_qnorm)

# Load in scRNA-seq data
l4_spec_mar = read.csv("data/l4_specific_March.csv",header=T,row.names = 1)
names(l4_spec_mar)[1]="name"
l5_spec_mar = read.csv("data/l5_specific_March.csv",header=T,row.names = 1)
names(l5_spec_mar)[1]="name"

l4_rnaseq_spec_mar = capture.output(cat(l4_spec_mar$name,sep = "|"))
l5_rnaseq_spec_mar = capture.output(cat(l5_spec_mar$name,sep = "|"))

CaTaDa

Here, we read in the CaTaDa peaks for each replicate/sample. Peaks are reduced to a single merged dataset that considers a peak region found in any replicate. CaTaDa coverage over each peak region is calculated for each replicate.

peaks.catada = Sys.glob("data/catada/peak_analysis.Dam*/*gff")
pr.catada = reduce.regions(peaks.catada)

catada_data = data[,c(1:3,10:15)]

global.mc.cores=7
catada_peaks = gr.occupancy(catada_data,pr.catada)

## Calculating gene values ...

catada_peaks$matches = all.overlaps.to.original(pr.catada,genes,maxgap = 1000)

NOISeq to discover peaks with differential occupancy

We use the nonparametric differential expression analysis package NOISeq to call differentially accessible or bound regions from the data. We find that NOISeq works well on all forms of TaDa data (including RNA Polymerase occupancy, CATaDa accessibility data and transcription factor binding).

cutoff=0.85

input=catada_peaks
row.names(input) = catada_peaks$name
mycounts=input[,c(2:(ncol(input)-2))]

myfactors = data.frame(
  Tissue = regmatches(names(input),regexpr(".*?(?=_r)",names(input),perl=T)),
  TissueRun=regmatches(names(input),regexpr(".*?(?=-ext300)",names(input),perl=T))
)
mydata = readData(data=mycounts,factors=myfactors)

Figure 6B and Figure 6 – figure supplement 2B

The plots below show the differential enrichment of expressed genes with differentially-accessible regions of chromatin, between the L4 and L5 lineages.

# Volcano plots
out.catada_final = plot_volcano(mydata,cond=c("dam_only_l4","dam_only_l5"),output.pdf = F,plot.title = "Chromatin accessibility changes",custom.labels = input$matches,theme.size = 15,text.repel.size = 2,max.overlaps = 20,highlight.genes = l4_rnaseq_spec_mar, input=input,x.lab = "log(L4/L5)", plotname = "Figure_6B", label.all=T, plot.subtitle = "L4 specific genes highlighted", highlight.col = "red", highlight.col2 = "#00E0E0", label.points = F,psize=0.3,hpsize = 0.3,pdfw=3,pdfh=3,xlim=c(-5,5),ylim=c(0,8),hpalpha = 1,ft.alt = "greater", test.genes = T)

## [1] "WARNING: Your experiment has biological replicates. You should consider using NOISeqBIO instead of NOISeq."
## [1] "Computing (M,D) values..."
## [1] "Computing probability of differential expression..."
## [1] "3080 differentially expressed features"
## [1] "1356 differentially expressed features (up in first condition)"
## [1] "1724 differentially expressed features (down in first condition)"
## [1] "3080 differentially expressed features"
## [1] "1356 differentially expressed features (up in first condition)"
## [1] "1724 differentially expressed features (down in first condition)"
##           up down
## Bound     88   36
## Unbound 1268 1688
## italic(p)==5.5e-10~';'~odds==3.25

## Warning: Removed 15 rows containing missing values (`geom_point()`).
## Removed 15 rows containing missing values (`geom_point()`).

## Warning: Removed 2 rows containing missing values (`geom_point()`).

out.catada_final2 = plot_volcano(mydata,cond=c("dam_only_l4","dam_only_l5"),output.pdf = F,plot.title = "Chromatin accessibility changes",custom.labels = input$matches,theme.size = 15,text.repel.size = 2,max.overlaps = 20,highlight.genes = l5_rnaseq_spec_mar, input=input,x.lab = "log(L4/L5)", plotname = "Figure_6—figure supplement_2B", label.all=T, plot.subtitle = "L5 specific genes highlighted", highlight.col = "red", highlight.col2 = "#00E0E0", label.points = F,psize=0.3,hpsize=0.3,pdfw=3,pdfh=3,xlim=c(-5,5),ylim=c(0,8),hpalpha = 1,ft.alt = "less", test.genes = T)

## [1] "WARNING: Your experiment has biological replicates. You should consider using NOISeqBIO instead of NOISeq."
## [1] "Computing (M,D) values..."
## [1] "Computing probability of differential expression..."
## [1] "3080 differentially expressed features"
## [1] "1356 differentially expressed features (up in first condition)"
## [1] "1724 differentially expressed features (down in first condition)"
## [1] "3080 differentially expressed features"
## [1] "1356 differentially expressed features (up in first condition)"
## [1] "1724 differentially expressed features (down in first condition)"
##           up down
## Bound     15   45
## Unbound 1341 1679
## italic(p)==1.7e-03~';'~odds==0.42

## Warning: Removed 15 rows containing missing values (`geom_point()`).

## Warning: Removed 15 rows containing missing values (`geom_point()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

# Venn diagram
region.venn(pr.catada,out.catada_final,"catada_venn.pdf")

## [1] "x total: 4826"
## [1] "y total: 5194"
## [1] "z total: 0"
## [1] "x only: 1356"
## [1] "y only: 1724"
## [1] "z only: 0"
## [1] "x-y total overlap: 3470"
## [1] "x-z total overlap: 0"
## [1] "y-z total overlap: 0"
## [1] "x-y only overlap: 3470"
## [1] "x-z only overlap: 0"
## [1] "y-z only overlap: 0"
## [1] "x-y-z overlap: 0"

Bsh binding

Now we take the Bsh peaks and perform the same analysis as with the CATaDa data above.

peaks.bsh = Sys.glob("data/peak_analysis*/*gff")
pr.bsh = reduce.regions(peaks.bsh)

bsh_qnorm_data = datam[,c(1:3,20:25)]

global.mc.cores=7
bsh_peaks = gr.occupancy(bsh_qnorm_data,pr.bsh)

## Calculating gene values ...

bsh_peaks$matches = all.overlaps.to.original(pr.bsh,genes,maxgap = 1000)

NOISeq to discover peaks with differential occupancy

input.bsh=bsh_peaks
input.bsh[,2:7]=2^input.bsh[,2:7]
row.names(input.bsh) = bsh_peaks$name
mycounts.bsh=input.bsh[,c(2:(ncol(input)-2))]

myfactors.bsh = data.frame(
  Tissue = regmatches(names(input.bsh),regexpr(".*?(?=_r)",names(input.bsh),perl=T)),
  TissueRun=regmatches(names(input.bsh),regexpr(".*?(?=-ext300)",names(input.bsh),perl=T))
)
mydata.bsh = readData(data=mycounts.bsh,factors=myfactors.bsh)

Figure 6C and Figure 6 – figure supplement 2C

These plots show the differential enrichment of expressed genes with differentially-bound Bsh peaks, between the L4 and L5 lineages.

# Volcano plots
# Figure 6C
out.bsh_final= plot_volcano(mydata.bsh,cond=c("bsh_dam_l4","bsh_dam_l5"),output.pdf = F,plot.title = "Bsh peaks",custom.labels = input.bsh$matches,theme.size = 16,text.repel.size = 2,max.overlaps = 30,highlight.genes  = l4_rnaseq_spec_mar,input=input.bsh,x.lab = "log2 FC (L4/L5)",plotname = "Figure_6C", plot.subtitle = "L4 specific genes highlighted", label.points = F, highlight.col = "red",psize=0.3,hpsize=0.3,pdfw=3,pdfh=3,xlim=c(-3,3),ylim=c(0,8.5),ft.alt = "greater",test.genes = T)

## [1] "WARNING: Your experiment has biological replicates. You should consider using NOISeqBIO instead of NOISeq."
## [1] "Computing (M,D) values..."
## [1] "Computing probability of differential expression..."
## [1] "2440 differentially expressed features"
## [1] "856 differentially expressed features (up in first condition)"
## [1] "1584 differentially expressed features (down in first condition)"
## [1] "2440 differentially expressed features"
## [1] "856 differentially expressed features (up in first condition)"
## [1] "1584 differentially expressed features (down in first condition)"
##          up down
## Bound    63   27
## Unbound 793 1557
## italic(p)==6.7e-12~';'~odds==4.58

## Warning: Removed 18 rows containing missing values (`geom_point()`).
## Removed 18 rows containing missing values (`geom_point()`).

## Warning: Removed 2 rows containing missing values (`geom_point()`).

# Figure 6 -- figure supplement 2C
out.bsh_final2= plot_volcano(mydata.bsh,cond=c("bsh_dam_l4","bsh_dam_l5"),output.pdf = F,plot.title = "Bsh peaks",custom.labels = input.bsh$matches,theme.size = 16,text.repel.size = 2,max.overlaps = 30,highlight.genes  = l5_rnaseq_spec_mar,input=input.bsh,x.lab = "log2 FC (L4/L5)",plotname = "Figure_6C-figure_supplement_2C", plot.subtitle = "L5 specific genes highlighted", label.points = F, highlight.col = "red",psize=0.3,hpsize=0.3,pdfw=3,pdfh=3,xlim=c(-3,3),ylim=c(0,8.5),ft.alt = "less",test.genes = T)

## [1] "WARNING: Your experiment has biological replicates. You should consider using NOISeqBIO instead of NOISeq."
## [1] "Computing (M,D) values..."
## [1] "Computing probability of differential expression..."
## [1] "2440 differentially expressed features"
## [1] "856 differentially expressed features (up in first condition)"
## [1] "1584 differentially expressed features (down in first condition)"
## [1] "2440 differentially expressed features"
## [1] "856 differentially expressed features (up in first condition)"
## [1] "1584 differentially expressed features (down in first condition)"
##          up down
## Bound    26   23
## Unbound 830 1561
## italic(p)==1.00~';'~odds==2.13

## Warning: Removed 18 rows containing missing values (`geom_point()`).

## Warning: Removed 18 rows containing missing values (`geom_point()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

# Venn diagram, Figure 6 -- figure supplement 2C
region.venn(pr.bsh,out.bsh_final,sprintf("Figure_6-figure_supplement_2C_venn_%s.pdf",curr.date))

## [1] "x total: 5873"
## [1] "y total: 6601"
## [1] "z total: 0"
## [1] "x only: 856"
## [1] "y only: 1584"
## [1] "z only: 0"
## [1] "x-y total overlap: 5017"
## [1] "x-z total overlap: 0"
## [1] "y-z total overlap: 0"
## [1] "x-y only overlap: 5017"
## [1] "x-z only overlap: 0"
## [1] "y-z only overlap: 0"
## [1] "x-y-z overlap: 0"

Su(H) motif searches

To search for enrichment of the Su(H) binding motif, we use the motif as defined in the cisbp database.

motif.suh.cisbp = "7 0.0 0.0 0.5 0.5 G
8 0.0 0.0 0.0 1.0 T
9 0.0 0.0 0.0 1.0 T
10 0.0 1.0 0.0 0.0 C
11 0.0 0.7 0.0 0.3 C
12 0.0 1.0 0.0 0.0 C
13 1.0 0.0 0.0 0.0 A
14 0.0 1.0 0.0 0.0 C
15 0.6 0.0 0.4 0.0 A"
motif.suh.pwm = str.to.pwm(motif.suh.cisbp,x1000 = T)

## [1]   0   0 500 500
## [1] 1000
## [1]    0    0    0 1000
## [1] 1000
## [1]    0    0    0 1000
## [1] 1000
## [1]    0 1000    0    0
## [1] 1000
## [1]   0 700   0 300
## [1] 1000
## [1]    0 1000    0    0
## [1] 1000
## [1] 1000    0    0    0
## [1] 1000
## [1]    0 1000    0    0
## [1] 1000
## [1] 600   0 400   0
## [1] 1000

Figure 6 – figure supplement 2A

These figures show the enrichment of the Su(H) motif in differentially-accessible regions of open chromatin between L4 and L5 neurons. In one figure, regions are linked to differentially expressed genes.

# Find Su(H) motif
catada.suh.pwm = pwm.find(pwm = motif.suh.pwm, regions = pr.catada$id, score.cut = "85%", fixed.width=1000)
catada.suh.pwm.r = catada.suh.pwm %>% filter(Freq>0) %>% dplyr::select(Var1) %>% as.vector %>% unlist %>% unname
suh.bound.catada = find.overlap.sites(catada.suh.pwm.r %>% as.character %>% regions.to.gr %>% remchr,pr.catada,maxgap = 0)

# suh sites bounds and expressed
out.catada_bshl4 = plot_volcano(mydata,cond=c("dam_only_l4","dam_only_l5"),output.pdf = F,plot.title = "Chromatin accessibility changes",custom.labels = input$matches,theme.size = 15,text.repel.size = 2,max.overlaps = 20,highlight.genes = l4_rnaseq_spec_mar, input=input,x.lab = as.expression(log[2]~"(L4/L5)"), plotname = "Su(H) overlap with accessible chromatin and expressed genes", label.all=T, plot.subtitle = "Su(H) motifs highlighted", highlight.loci =suh.bound.catada$id,highlight.col = "red", highlight.loci.col = "cornflowerblue", label.points = F,psize=0.2,spsize=0.2,pdfw=3,pdfh=3,xlim=c(-5,5),ylim=c(0,8),hpalpha = 0.4,lpalpha=0.8,ft.alt = "greater", test.genes = l4_rnaseq_spec_mar, test.loci =1,lpsize=0.2,hpsize=0.5 )

## [1] "WARNING: Your experiment has biological replicates. You should consider using NOISeqBIO instead of NOISeq."
## [1] "Computing (M,D) values..."
## [1] "Computing probability of differential expression..."
## [1] "3080 differentially expressed features"
## [1] "1356 differentially expressed features (up in first condition)"
## [1] "1724 differentially expressed features (down in first condition)"
## [1] "3080 differentially expressed features"
## [1] "1356 differentially expressed features (up in first condition)"
## [1] "1724 differentially expressed features (down in first condition)"
##          up down
## Bound    36   10
## Unbound 578  620
## italic(p)==4.1e-05~';'~odds==3.86

## Warning: Removed 15 rows containing missing values (`geom_point()`).
## Removed 15 rows containing missing values (`geom_point()`).

## Warning: Removed 3 rows containing missing values (`geom_point()`).

## Warning: Removed 2 rows containing missing values (`geom_point()`).

out.catada_bshl4 = plot_volcano(mydata,cond=c("dam_only_l4","dam_only_l5"),output.pdf = F,plot.title = "Chromatin accessibility changes",custom.labels = input$matches,theme.size = 15,text.repel.size = 2,max.overlaps = 20,input=input,x.lab = as.expression(log[2]~"(L4/L5)"), plotname = "Figure_6-figure_supplement_2A", label.all=T, plot.subtitle = "Su(H) motifs highlighted", highlight.loci =suh.bound.catada$id,highlight.loci.col = "cornflowerblue", label.points = F,psize=0.2,spsize=0.2,pdfw=3,pdfh=3,xlim=c(-5,5),ylim=c(0,8),hpalpha = 0.4,lpalpha=0.8,ft.alt = "greater", test.genes = NULL, test.loci =1,lpsize=0.2,hpsize=0.5 )

## [1] "WARNING: Your experiment has biological replicates. You should consider using NOISeqBIO instead of NOISeq."
## [1] "Computing (M,D) values..."
## [1] "Computing probability of differential expression..."
## [1] "3080 differentially expressed features"
## [1] "1356 differentially expressed features (up in first condition)"
## [1] "1724 differentially expressed features (down in first condition)"
## [1] "3080 differentially expressed features"
## [1] "1356 differentially expressed features (up in first condition)"
## [1] "1724 differentially expressed features (down in first condition)"
##          up down
## Bound   614  630
## Unbound 742 1094
## italic(p)==5.7e-07~';'~odds==1.44

## Warning: Removed 15 rows containing missing values (`geom_point()`).

## Warning: Removed 15 rows containing missing values (`geom_point()`).

## Warning: Removed 3 rows containing missing values (`geom_point()`).

Bsh and CaTaDa in L4/L5 neurons

Introduction

Initial NGS data processing

Load libraries and define functions

Load up the data

CaTaDa

NOISeq to discover peaks with differential occupancy

Figure 6B and Figure 6 – figure supplement 2B

Bsh binding

NOISeq to discover peaks with differential occupancy

Figure 6C and Figure 6 – figure supplement 2C

Su(H) motif searches

Figure 6 – figure supplement 2A