Transcription Factor Enrichment Analysis

This script performs transcription factor analyses using TFEA.ChIP and Enrichr.

true

Dependencies

Load requisite packages. This script uses my personal utilities package brainstorm, which can be downloaded via devtools::install_github("ayushnoori/brainstorm").

# data manipulation
library(data.table)
library(purrr)
library(magrittr)

# TFEA package
library(TFEA.ChIP)

# access Enrichr API
library(httr)
library(rjson)

# Excel manipulation
library(openxlsx)

# string manipulation
library(stringr)

# data visualization
library(ggplot2)
library(ggpubr)
library(plotly)
library(htmlwidgets)
library(RColorBrewer)

# utility functions
library(brainstorm)

Note that directories are relative to the R project path.

# set directories
ddir = file.path("Data", "3 - Transcription Factor Enrichment Analysis")
dir3 = file.path("Results", "3 - Transcription Factor Enrichment Analysis")

Read Data

Read ADRA protein set and load ChIP-Seq experiment data [14].

# read data
dat = fread(file.path("Data", "ADRA Protein Set.csv"))

# load ChIP-Seq experiment data
load(file.path(ddir, "ReMap+GH_doubleElite.Rdata"))
set_user_data(MetaData, Mat01)

TFEA.ChIP Analysis

Perform Transcription Factor Enrichment Analysis (TFEA) using the TFEA.ChIP package [5]. As a control list is not provided, use all human genes NOT in the ADRA marker set as the control.

# convert ADRA gene symbols to ENTREZ
entrez = GeneID2entrez(gene.IDs = dat[, Symbol])

# compute contingency matrices
CM = contingency_matrix(entrez)
pval = getCMstats(CM) %>% as.data.table()

# compute confidence intervals
CI = map(CM, ~fisher.test(.x, conf.int = TRUE, conf.level = 0.95)$conf.int) %>%
  { data.table(Accession = names(.), CI.left = map(., 1), CI.right = map(., 2)) }

# merge with confidence intervals
pval = merge(pval, CI, by = "Accession", all.x = TRUE)[order(-abs(distance)), ]

# write to Excel file
write.xlsx(pval, file.path(dir3, "TFEA Association Test.xlsx"),
           asTable = TRUE, tableStyle = "TableStyleMedium19")

TFEA.ChIP Volcano Plot

Select transcription factors to highlight and define colors.

# identify top TFs
topTFs = summary(as.factor(pval[1:200, TF]))
print(topTFs[order(-topTFs)][1:5])

# select top TFs
TF = c("CTCF", "ESR1")
names(TF) = TF

# select plotting colors
TFcol = c("#00FFD9", "#FF006F")

Create interactive volcano plot of transcription factor enrichment analysis results.

# create volcano plot
CMplot = plot_CM(pval, plot_title = "", specialTF = TF, TF_colors = TFcol)

# create axis sequences
xseq = pval[, log10.adj.pVal] %>% { seq(min(., na.rm = TRUE), max(., na.rm = TRUE), by = 0.1) }
yseq = pval[, log2.OR] %>% { seq(min(., na.rm = TRUE), max(., na.rm = TRUE), by = 0.1) }

# edit volcano plot
CMplot = CMplot %>%
  
  # add p-value threshold line
  add_lines(x = -log10(0.05), y = yseq,
            line = list(dash = "dash", width = 1.5, color = "red"),
            inherit = FALSE, hoverinfo = "none", name = "P-Value < 0.05",
            visible = "legendonly") %>%
  
  # add OR threshold lines
  add_lines(x = xseq, y = log2(1.5),
            line = list(dash = "dash", width = 1.5, color = "blue"),
            inherit = FALSE, hoverinfo = "none", name = "OR > 1.5",
            visible = "legendonly") %>%
  add_lines(x = xseq, y = -log2(1.5),
            line = list(dash = "dash", width = 1.5, color = "blue"),
            inherit = FALSE, hoverinfo = "none", name = "OR < -1.5",
            visible = "legendonly") %>%
  
  # add axis labels
layout(xaxis = list(title = "-Log<sub>10</sub> Adjusted P-Value"),
       yaxis = list(title = "Log<sub>2</sub> Odds Ratio"))  %>%
  
  # configure SVG output
config(toImageButtonOptions = list(format = "svg", filename = "TFEA Plot SVG",
                                   width = 2000, height = 1200))

# save figure
htmlwidgets::saveWidget(CMplot, file.path(dir3, "TFEA Plot.html"))
saveRDS(CMplot, file.path(dir3, "TFEA Plotly Object.rds"))

Create enhanced, static volcano plot of transcription factor enrichment analysis results.

# prepare data
pval_vp = copy(pval) %>%
  .[, Highlight := TF] %>%
  .[!(Highlight %in% c("CTCF", "ESR1")), Highlight := "Other"]

# plot data
vp = ggplot(pval_vp, aes(x = log2.OR, y = log10.adj.pVal, fill = Highlight)) +
  geom_hline(yintercept = 0, color = "black", size = 0.3) +
  geom_vline(xintercept = -1.99, color = "black", size = 0.3) +
  geom_point(alpha = 0.3, size = 1.2, shape = 21, stroke = 0) +
  scale_fill_manual("", values = c("#01CB5F", "#F17F29", "#687278")) +
  geom_point(data = pval[TF == "CTCF"], fill = "#01CB5F", size = 1.2, shape = 21, stroke = 0) +
  geom_point(data = pval[TF == "ESR1"], fill = "#F17F29", size = 1.2, shape = 21, stroke = 0) +
  geom_hline(yintercept = -log10(0.05), linetype="dashed", color = "#EF476F", size = 0.3) +
  geom_vline(xintercept = -log2(1.5), linetype="dashed", color = "#3066BE", size = 0.3) +
  geom_vline(xintercept = log2(1.5), linetype="dashed", color = "#3066BE", size = 0.3) +
  coord_cartesian(xlim = c(-1.99, 1.99), ylim = c(-0.01, 1.99), expand = FALSE) +
  xlab(bquote(bold(log[2]*' Fold-Change'))) +
  ylab(bquote(bold(-log[10]*' FDR '*bolditalic(p)*'-value'))) +
  theme_light() +
  theme(legend.text = element_text(size = 10, face = "bold"),
        legend.position = c(0.1, 0.93),
        legend.background = element_rect(fill = NA, color = NA),
        axis.ticks = element_line(color = "black", size = 0.3),
        panel.border = element_blank())

# save plot
# ggsave(file.path(dir3, "TFEA Volcano Plot.pdf"), vp, width = 8, height = 5.5)
ggsave(file.path(dir3, "TFEA Volcano Plot.svg"), vp, width = 8, height = 5.5)
ggsave(file.path(dir3, "TFEA Volcano Plot.pdf"), vp, width = 8, height = 5.5)

Enrichr Analysis

Perform TFEA using the Enrichr API [6].

# define API call parameters
my_genes = gsub("^\\s+|\\s+$", "", dat[, Symbol])
my_library = "ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X"
my_description = "ADRA Marker Set"

Function for API call. This function was adapted from the Python scripts provided in the Enrichr API documentation here.

enrichr_API = function(genes, description, library) {
  
  # create API call to add gene list
  add_url = "http://amp.pharm.mssm.edu/Enrichr/addList"
  genes_str = paste(genes, collapse="\n")
  payload = list(list = genes_str, description = description)
  
  # make and parse API call
  add_request = httr::POST(url = add_url, body = payload)
  add_response = httr::content(add_request, as = "text", encoding = "UTF-8") %>% fromJSON()
  
  # wait before next call
  Sys.sleep(0.5)
  
  # create URL for next API call to calculate enrichment
  enrich_url = paste0("http://amp.pharm.mssm.edu/Enrichr/enrich", "?userListId=",
                      add_response$userListId, "&backgroundType=", library)
  
  # make and parse next API call
  request = httr::GET(url = enrich_url)
  response = httr::content(request, as = "text", encoding = "UTF-8") %>% fromJSON() %>% .[[1]]
  
  # convert to data.table
  convDT = function(res) { res[[6]] = toString(res[[6]]); return(res)  }
  response = map(response, convDT) %>% rbindlist()
  setnames(response, c("Rank", "Term", "p.value", "z.score",
                       "combined.score", "Genes", "adj.p.value",
                       "old.p.value", "old.adj.p.value"))
  
  # calculate -log10(adj. p-value), then filter
  response[, log.adj.p := -log10(adj.p.value)]
  response = response[, .(Term, p.value, adj.p.value, log.adj.p, z.score, combined.score, Genes)]
  
  # return response object
  return(response)

}

Make API call to calculate enrichment.

# make API call
enrichr = enrichr_API(my_genes, my_description, my_library)

# write to Excel file
write.xlsx(enrichr, file.path(dir3, "Enrichr Results.xlsx"),
           asTable = TRUE, tableStyle = "TableStyleMedium19")

Enrichr Plot

Function to plot Enrichr results.

plot_enrichr = function(enrichr_results, library = NULL) {
  
  # for TFEA only
  enrichr_results[, Term := word(Term, 1)]
  
  # select top TFs
  enrichr_results = enrichr_results[order(-log.adj.p), ][1:10, ]
  enrichr_results[, Term := factor(Term, levels = rev(Term))]
  
  # create barplot
  p = ggplot(enrichr_results, aes(x = Term, y = log.adj.p, fill = log.adj.p))+
    geom_col() +
    coord_flip() +
    geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
    geom_text(x = 1.7, y = -log10(0.05) + 0.03,
              label = "italic(p)*'-value < 0.05'", angle = 270,
              size = 2.5, parse = TRUE) +
    scale_fill_gradient(low="#FFD166", high="#A63446") +
    scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, 0.05))) +
    theme_light() +
    theme(plot.title = element_text(size = 16, color = "black", face = "bold", hjust = 0.5),
          axis.title.x = element_blank(), axis.title.y = element_blank(),
          legend.position = "none",
          strip.text = element_text(size=12, color = "black", face="bold"),
          strip.background = element_rect(color=NA, fill="#D9D9D9", size=1, linetype="solid"))
  
  if(!is.null(library)) {
    enrichr_results[, Library := library]
    p = p + facet_wrap(Library ~ ., scales = "free", ncol = 1)
  } else {
    p = p + 
      ylab(bquote(bold(-log[10]*'(adj. '*bolditalic(p)*'-value)'))) +
      theme(axis.title.x = element_text(size = 14, color = "black"))
  }
  
  return(p)

}

Finally, create and save the Enrichr plot.

# create plot
cplots = plot_enrichr(enrichr)

# save figure
ggsave(file.path(dir3, "Enrichr Barplot.pdf"), cplots, height = 4, width = 12)

To create plots of multiple Enrichr libraries, apply purrr::map over the library names (i.e., where the function argument .f is plot_enrichr(enrichr, library_name)), then use ggarrange(plotlist = ., ncol = 1, align = "hv") to create a composite figure.

[1]
Lachmann A, Xu H, Krishnan J, Berger SI, Mazloom AR, Ma’ayan A. ChEA: Transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. Bioinformatics (Oxford, England) 2010;26:2438–44. https://doi.org/10.1093/bioinformatics/btq466.
[2]
Dunham I, Kundaje A, Aldred SF, Collins PJ, Davis CA, Doyle F, et al. An integrated encyclopedia of DNA elements in the human genome. Nature 2012;489:57–74. https://doi.org/10.1038/nature11247.
[3]
Fishilevich S, Nudel R, Rappaport N, Hadar R, Plaschkes I, Iny Stein T, et al. GeneHancer: Genome-wide integration of enhancers and target genes in GeneCards. Database 2017;2017. https://doi.org/10.1093/database/bax028.
[4]
Chèneby J, Gheorghe M, Artufel M, Mathelier A, Ballester B. ReMap 2018: An updated atlas of regulatory regions from an integrative analysis of DNA-binding ChIP-seq experiments. Nucleic Acids Research 2018;46:D267–75. https://doi.org/10.1093/nar/gkx1092.
[5]
Puente-Santamaria L, Wasserman WW, Del Peso L. TFEA.ChIP: A tool kit for transcription factor binding site enrichment analysis capitalizing on ChIP-seq datasets. Bioinformatics (Oxford, England) 2019;35:5339–40. https://doi.org/10.1093/bioinformatics/btz573.
[6]
Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, et al. Enrichr: A comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research 2016;44:W90–97. https://doi.org/10.1093/nar/gkw377.

References

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.