Dependencies

Load requisite packages. Note that the tibble package is not loaded (to preempt namespace conflicts) but the tibble::add_row function is explicitly called.

This script also uses my personal utilities package brainstorm, which can be downloaded via devtools::install_github("ayushnoori/brainstorm").

# data manipulation
library(data.table)
library(purrr)
library(magrittr)

# string operations
library(stringr)

# Excel manipulation
library(openxlsx)

# data visualization
library(ggplot2)
library(ggpubr)

# utility functions
library(brainstorm)

Note that directories are relative to the R project path.

# set directories
ddir = file.path("Data", "2 - Pathway Enrichment Analysis")
dir2 = file.path("Results", "2 - Pathway Enrichment Analysis")

Read Data

Read data, then input the gene symbols to http://www.gsea-msigdb.org/gsea/msigdb/annotate.jsp. Compute overlaps separately against the Gene Ontology (GO:BP: GO biological process, GO:CC: GO cellular component, GO:MF: GO molecular function) [1,2] and Reactome (CP:REACTOME: Reactome gene sets) [3] databases available from the Molecular Signatures Database [4]. For each analysis, show the top 100 gene sets with FDR q-value less than 1 (i.e., no q-value cutoff). Place output files in the Data/2 - Pathway Enrichment Analysis directory.

# read data
markers = fread(file.path("Data", "ADRA Protein Set.csv"))
# cat(markers[, Symbol])

Read MSigDB mapping file (version 7.2), which was downloaded as an XML file from https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.2/msigdb_v7.2.xml, parsed in Excel, and filtered for GO/Reactome pathways only.

# read mapping file
mapDB = fread(file.path(ddir, "MSigDB Mapping v7.3.csv")) %>%
  .[, .(STANDARD_NAME, EXACT_SOURCE, MEMBERS_SYMBOLIZED, MEMBERS)]
setnames(mapDB, c("Pathway", "ID", "Symbol", "ENTREZ"))

Parse Pathway Enrichment Results

Define Excel styles for workbook object.

# header style
hs = createStyle(fontColour = "#FAF3DD", fgFill = "#337CA0",
                 fontName = "Arial Black", halign = "center",
                 valign = "center", textDecoration = "Bold",
                 border = "Bottom", borderStyle = "thick", fontSize = 14)

# row style #1
r1 = createStyle(fontColour = "#363635", fgFill = "#FAF3DD",
                 fontName = "Arial", fontSize = 10)

# row style #2
r2 = createStyle(fontColour = "#363635", fgFill = "#C4E4E9",
                 fontName = "Arial", fontSize = 10)

# subheader style
sh = createStyle(fontColour = "#363635", fgFill = "#FFA69E",
                 fontName = "Arial", textDecoration = "Bold",
                 border = "TopBottom", borderStyle = "thick")

Define function to add sheet to workbook, where clus is a vector of cluster labels, sheet is the worksheet name, and header is the vector of header indices.

add_sheet = function(datWB, clus, sheet, header, wb) {
  
  # add worksheet
  addWorksheet(wb, sheetName = sheet)
  writeDataTable(wb, sheet, x = datWB, tableStyle = "TableStyleMedium15", headerStyle = hs, bandedRows = FALSE)
  setColWidths(wb, sheet, cols = 1:10, widths = c("auto", 60, 8, 8, 8, 16, 16, 20, 20, 200))
  freezePane(wb, sheet, firstRow = TRUE, firstCol = FALSE)
  
  # add styling
  even = clus %% 2 == 0; even[is.na(even)] = FALSE
  addStyle(wb, sheet, r1, rows = which(even) + 1, cols = 1:10, gridExpand = T)
  addStyle(wb, sheet, r2, rows = which(!even) + 1, cols = 1:10, gridExpand = T)
  addStyle(wb, sheet, sh, rows = header + 1, cols = 1:10, gridExpand = T)
  
}

Parse, cluster, and tabulate pathway enrichment results. The similarity matrix is created by computing the Jaccard similarity coefficient, then converted to Jaccard distance by subtracting from 1.

# function to compute Jaccard similarity coefficient
jaccard = function(a, b) {return(length(intersect(a,b))/length(union(a, b)))}

# read pathway enrichment results
parse_pathways = function(fname, wb) {
  
  # read file
  dat = fread(file.path(ddir, fname))
  setnames(dat, c("Pathway", "K", "Description", "k", "Ratio", "p", "q"))
  
  # merge with mapping, clean pathway names, get member list
  dat = merge(dat, mapDB, by = "Pathway", all.x = TRUE) %>%
    .[, Pathway := gsub("_", " ", Pathway)] %>%
    .[, Pathway := sub(".*? ", "", Pathway)] %>%
    .[, Members := strsplit(ENTREZ, ",")]
  
  # create pairwise similarity matrix using Jaccard index
  sim = dat[, Members] %>% outer(., ., FUN = Vectorize(jaccard))
  rownames(sim) = dat[, ID]; colnames(sim) = dat[, ID]
  
  # specify number of clusters
  nclust = 15
  # h = 0.95
  
  # convert to dissimilarity matrix, then perform hierarchical clustering
  pclust = as.dist(1 - sim) %>%
    hclust() %>%
    cutree(k = nclust) %>%
    # cutree(h = 0.95) %>%
    data.table(ID = names(.), Cluster = .)
  
  # get number of clusters
  # nclust = pclust[, length(unique(Cluster))]
  # print(nclust)
  
  # merge with cluster, then group by cluster and order by q-value
  dat = merge(dat, pclust, by = "ID", all.x = TRUE) %>%
    .[order(Cluster, q), ] %>%
    .[, .(ID, Pathway, K, k, Ratio, p, q, Cluster, Symbol, ENTREZ, Description)]
  
  # rename columns for Excel workbook
  datWB = copy(dat)
  setnames(datWB, c("ID", "Pathway", "K (Genes in Pathway)",
                    "k (Genes in Overlap)", "k/K", "p-value", "FDR q-value",
                    "Cluster", "Symbol", "ENTREZ", "Description"))
  header = datWB[, which(!duplicated(Cluster))] + 0:(nclust - 1)
  
  # add rows for manual annotatioN
  for (z in header) {
    datWB = tibble::add_row(datWB, .before = z)
    datWB[z, ID := paste0("Pathway #", datWB[z + 1, "Cluster"])]
  }
  
  # get cluster and sheet name, add to workbook
  clus = datWB[, Cluster]; datWB[, Cluster := NULL]
  sheet = fname %>% gsub("\\s+Enrichment\\.csv$", "", .)
  add_sheet(datWB, clus, sheet, header, wb)
  
  # return original data
  return(dat)
  
}

Map pathway_results function over each analysis (i.e., GO: BP, GO: CC, GO: MF, and Reactome).

# get file names
files = list.files(path = ddir, pattern = "Enrichment\\.csv$")

# create workbook object
grpWB = createWorkbook()

# map over file names
pathways = map(files, ~parse_pathways(.x, grpWB))

Save workbook object.

# define file paths
raw = file.path(dir2, "Pathway Enrichment Analysis Raw.xlsx")
annot = file.path(dir2, "Pathway Enrichment Analysis Annotated.xlsx")

# save workbooks
saveWorkbook(grpWB, raw, overwrite = TRUE)
if(!file.exists(annot)) { file.copy(raw, annot) }

Aggregate Annotated Pathways

Complete pathway annotations by manually editing the file Pathway Enrichment Analysis Annotated.xlsx. After annotations are complete, execute the following chunks which compute cluster statistics.

compute_cluster = function(sheet) {
  
  # read data
  dat = read.xlsx(annot, sheet = sheet, check.names = TRUE) %>% as.data.table()
  setnames(dat,  c("ID", "Pathway", "K", "k", "Ratio", "p", "q",
                   "Symbol", "ENTREZ", "Description"))
  
  # get header indices
  header = dat[, which(is.na(Description))]
  
  # extract cluster labels
  clus = dat[header, .(ID, Pathway)]
  nclus = c(header[-1], nrow(dat) + 1) - (header + 1)
  
  # re-create cluster labels
  dat = dat[-header, ][, Cluster := rep(clus$ID, nclus)]
  
  # compute cluster statistics
  clusdat = dat[, .(Ratio = sum(k)/sum(K), logQ = mean(-log10(q))), by = Cluster]
  clus = merge(clus, clusdat, by.x = "ID", by.y = "Cluster")[, Database := sheet]
  
  return(clus)
  
}

Map compute_cluster function over each sheet of the annotations workbook (i.e., GO: BP, GO: CC, GO: MF, and Reactome) to compute cluster statistics.

# get file names
sheets = getSheetNames(annot)

# map over file names
clusters = map_dfr(sheets, compute_cluster)

# refactor cluster database labels and order by -log10(FDR q-value)
clusters = clusters %>%
  .[, ID := NULL] %>%
  .[, Database := factor(
    Database,
    levels = c("GO BP", "GO CC", "GO MF", "Reactome"),
    labels = c("GO: Biological Process", "GO: Cellular Component",
               "GO: Molecular Function", "Reactome"))] %>%
  .[order(Database, logQ), ]

Function to plot pathway data in a barplot for each enrichment analysis (i.e., for each database). Note that facet_wrap is used for visual purposes only. Each plot created has a single facet.

plot_pathways = function(datDB, DB) {
  
  datDB[, Pathway := factor(Pathway, levels = Pathway)]

  p = ggplot(datDB, aes(x = Pathway, y = logQ))+ # fill = Ratio
    geom_col() +
    coord_flip() +
    # scale_fill_gradient(low="#FFD166", high="#A63446") +
    facet_wrap(Database ~ ., scales = "free", ncol = 1) +
    # labs(fill = "Gene Ratio") + 
    scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, 0.05))) +
    theme_light() +
    theme(plot.title = element_text(size = 16, color = "black", face = "bold", hjust = 0.5),
          axis.title.x = element_blank(), axis.title.y = element_blank(),
          legend.title = element_text(face = "bold"),
          strip.text = element_text(size=12, color = "black", face="bold"),
          strip.background = element_rect(color=NA, fill="#D9D9D9", size=1, linetype="solid"))
  
  return(p)

}

Apply function over each database to create aggregated plot.

# map over each database
cplots = imap(split(clusters, by = "Database"), plot_pathways) %>%
  ggarrange(plotlist = ., ncol = 1, nrow = 4, align = "hv") %>%
  annotate_figure(bottom = text_grob(bquote(bold(-log[10]*'(FDR '*bolditalic(q)*'-value)')),
                                     size = 16, color = "black", hjust = 0))

# save figure
ggsave(file.path(dir2, "Pathway Enrichment Analysis Barplot.pdf"), cplots, height = 16, width = 12)

[1]

Ashburner M, Ball CA, Blake JA, Botstein D, Butler H, Cherry JM, et al. Gene Ontology: Tool for the unification of biology. Nature Genetics 2000;25:25–9. https://doi.org/10.1038/75556.

[2]

The Gene Ontology Consortium. The Gene Ontology Resource: 20 years and still GOing strong. Nucleic Acids Research 2019;47:D330–8. https://doi.org/10.1093/nar/gky1055.

[3]

Jassal B, Matthews L, Viteri G, Gong C, Lorente P, Fabregat A, et al. The Reactome Pathway Knowledgebase. Nucleic Acids Research 2020;48:D498–503. https://doi.org/10.1093/nar/gkz1031.

[4]

Liberzon A, Subramanian A, Pinchback R, Thorvaldsdóttir H, Tamayo P, Mesirov JP. Molecular signatures database (MSigDB) 3.0. Bioinformatics 2011;27:1739–40. https://doi.org/10.1093/bioinformatics/btr260.

Pathway Enrichment Analysis

Dependencies

Read Data

Parse Pathway Enrichment Results

Aggregate Annotated Pathways

References

Corrections