This R script analyzes mass spectrometry data from mouse brains, performing pathway enrichment analysis using Reactome and creating network visualizations of protein-protein interactions. It includes heatmap generation, Circos plots, chord diagrams, and interactive network graphs to explore relationships between proteins and their functional groups.
Load requisite packages and define directories. Note that this script may also use my personal utilities package brainstorm
, which can be downloaded via devtools::install_github("ayushnoori/brainstorm")
.
Note that directories are relative to the R project path.
CP:REACTOME collection with 100 overlaps shown, 1289 gene sets in collection, 84 genes in comparison (converted 87 identifiers into 84 NCBI [Entrez] genes), and 42,739 genes in the universe (N).
OLD results: CP:REACTOME collection with 100 overlaps shown, 1261 gene sets in collection, 84 genes in comparison (converted 87 identifiers into 84 NCBI [Entrez] genes), and 42,726 genes in the universe (N).
# Read data
reactome_enr = fread(here(data_dir, "enrichment_results.tsv"), header = T)
# Transform data
reactome_enr_plot = copy(reactome_enr) %>%
.[, logQ := -log(`FDR q-value`, base = 10)] %>%
.[, Ratio := `k/K`] %>%
.[1:20] %>%
.[, Description := factor(Description, levels = rev(Description))]
# Plot graph
p = ggplot(reactome_enr_plot, aes(x = Description, y = logQ, fill = Ratio)) +
geom_col() +
coord_flip() +
scale_fill_gradient(low="#FFD166", high="#A63446") +
theme_light() +
labs(x = "Pathway", y = expression(bold(-log[10](paste("FDR ", italic(q), "-value"))))) +
scale_y_continuous(expand = expansion(c(0, 0.05))) +
theme(plot.title = element_text(size = 16, color = "black", face = "bold", hjust = 0.5),
axis.title = element_text(face = "bold"),
legend.title = element_text(face = "bold"))
# # Save plot
# # ggsave(here(results_dir, "MS Reactome Enrichment.pdf"), p, width = 10, height = 12)
# ggsave(here(results_dir, "MS Reactome Enrichment.pdf"), p, width = 8, height = 4)
Read CP:Reactome GMT file downloaded from https://www.gsea-msigdb.org/gsea/msigdb/mouse/collections.jsp.
# reactome_gmt = qusage::read.gmt(here(data_dir, "m2.cp.reactome.v2023.2.Mm.symbols.gmt"))
reactome_gmt = qusage::read.gmt(here(data_dir, "m2.cp.reactome.v2024.1.Mm.symbols.gmt"))
# Read gene list
gene_list = readxl::read_excel(here(data_dir, "List of 89 proteins false for GFP.xlsx"), sheet = "Functional Annotations") %>% as.data.table()
Define Excel styles for workbook object.
# Header style
hs = createStyle(fontColour = "#FAF3DD", fgFill = "#337CA0",
fontName = "Arial Black", halign = "center",
valign = "center", textDecoration = "Bold",
border = "Bottom", borderStyle = "thick", fontSize = 14)
# Row style #1
r1 = createStyle(fontColour = "#363635", fgFill = "#FAF3DD",
fontName = "Arial", fontSize = 10)
# Row style #2
r2 = createStyle(fontColour = "#363635", fgFill = "#C4E4E9",
fontName = "Arial", fontSize = 10)
# Subheader style
sh = createStyle(fontColour = "#363635", fgFill = "#FFA69E",
fontName = "Arial", textDecoration = "Bold",
border = "TopBottom", borderStyle = "thick")
Define function to add sheet to workbook, where clus
is a vector of cluster labels, sheet
is the worksheet name, and header
is the vector of header indices.
add_sheet = function(datWB, clus, sheet, header, wb) {
# Add worksheet
addWorksheet(wb, sheetName = sheet)
writeDataTable(wb, sheet, x = datWB, tableStyle = "TableStyleMedium15", headerStyle = hs, bandedRows = FALSE)
setColWidths(wb, sheet, cols = 1:12, widths = c(60, 60, 8, 8, 8, 8, 8, 8, 16, 16, 200, 200))
freezePane(wb, sheet, firstRow = TRUE, firstCol = FALSE)
# Add styling
even = clus %% 2 == 0; even[is.na(even)] = FALSE
addStyle(wb, sheet, r1, rows = which(even) + 1, cols = 1:12, gridExpand = T)
addStyle(wb, sheet, r2, rows = which(!even) + 1, cols = 1:12, gridExpand = T)
addStyle(wb, sheet, sh, rows = header + 1, cols = 1:12, gridExpand = T)
}
Parse, cluster, and tabulate pathway enrichment results. The similarity matrix is created by computing the Jaccard similarity coefficient, then converted to Jaccard distance by subtracting from 1.
# Function to compute Jaccard similarity coefficient
jaccard = function(a, b) {return(length(intersect(a,b))/length(union(a, b)))}
# Read file
# dat = fread(file.path(data_dir, fname))
dat = copy(reactome_enr)
setnames(dat, c("Pathway", "K", "Description", "k", "Ratio", "p", "q"))
# # Merge with mapping, clean pathway names, get member list
# dat = merge(dat, mapDB, by = "Pathway", all.x = TRUE) %>%
# .[, Pathway := gsub("_", " ", Pathway)] %>%
# .[, Pathway := sub(".*? ", "", Pathway)] %>%
# .[, Members := strsplit(ENTREZ, ",")]
# Merge with mapping
dat[, Genes := reactome_gmt[Pathway]]
# Get genes in overlap
genes_in_overlap = function(all_genes) {
return(all_genes[all_genes %in% gene_list$Gene])
}
dat[, Overlap := map(Genes, genes_in_overlap)]
dat[, k_M := map_dbl(Overlap, length)]
dat[, K_M := map_dbl(Genes, length)]
dat[, Ratio_M := round(k_M/K_M, 4)]
# create pairwise similarity matrix using Jaccard index
sim = dat[, Genes] %>% outer(., ., FUN = Vectorize(jaccard))
rownames(sim) = dat[, Pathway]; colnames(sim) = dat[, Pathway]
# specify number of clusters
nclust = 15
# h = 0.9999
# convert to dissimilarity matrix, then perform hierarchical clustering
pclust = as.dist(1 - sim) %>%
hclust() %>%
cutree(k = nclust) %>%
# cutree(h = 0.95) %>%
data.table(Pathway = names(.), Cluster = .)
# # get number of clusters
# nclust = pclust[, length(unique(Cluster))]
# print(nclust)
# merge with cluster, then group by cluster and order by q-value
dat = merge(dat, pclust, by = "Pathway", all.x = TRUE) %>%
.[order(Cluster, q), ] %>%
# .[, .(Pathway, Description, K, k, Ratio, p, q, Cluster, Genes)]
.[, .(Pathway, Description, K, K_M, k, k_M, Ratio, Ratio_M, p, q, Cluster, Overlap, Genes)]
# rename columns for Excel workbook
datWB = copy(dat)
# setnames(datWB, c("Pathway", "Description", "K (Genes in Pathway)",
# "k (Genes in Overlap)", "k/K", "p-value", "FDR q-value",
# "Cluster", "Genes"))
setnames(datWB, c("Pathway", "Description", "K (Genes in Pathway)", "K (Pathway Measured)",
"k (Genes in Overlap)", "k (Overlap Measured)", "k/K", "k/K (Ratio Measured)", "p-value",
"FDR q-value", "Cluster", "Overlap", "Genes"))
header = datWB[, which(!duplicated(Cluster))] + 0:(nclust - 1)
datWB[, `K (Genes in Pathway)` := as.double(`K (Genes in Pathway)`)]
datWB[, `k (Genes in Overlap)` := as.double(`k (Genes in Overlap)`)]
# add rows for manual annotation
for (z in header) {
datWB = tibble::add_row(datWB, .before = z)
datWB[z, Pathway := paste0("Pathway #", datWB[z + 1, "Cluster"])]
# Get next row
z_next = ifelse(z != max(header), header[which(header == z) + 1] - 1, z + 1)
# Get lists of genes
k_cluster = unique(unlist(datWB[(z + 1):z_next, `Overlap`]))
K_cluster = unique(unlist(datWB[(z + 1):z_next, `Genes`]))
ratio_cluster = length(k_cluster)/length(K_cluster)
datWB[z, `k (Overlap Measured)` := length(k_cluster)]
datWB[z, `K (Pathway Measured)` := length(K_cluster)]
datWB[z, `k/K (Ratio Measured)` := ratio_cluster]
# datWB[z, `K (Genes in Pathway)` := datWB[(z + 1):z_next, mean(`K (Genes in Pathway)`)]]
# datWB[z, `k (Genes in Overlap)` := datWB[(z + 1):z_next, mean(`k (Genes in Overlap)`)]]
# datWB[z, `k/K` := datWB[(z + 1):z_next, mean(`k/K`)]]
datWB[z, `p-value` := datWB[(z + 1):z_next, mean(log(`p-value`))]]
datWB[z, `FDR q-value` := datWB[(z + 1):z_next, mean(log(`FDR q-value`))]]
}
# get cluster and sheet name, add to workbook
clus = datWB[, Cluster]; datWB[, Cluster := NULL]
# add to workbook
wb = createWorkbook()
sheet = "Reactome Enrichment"
add_sheet(datWB, clus, sheet, header, wb)
Save workbook object.
Complete pathway annotations by manually editing the file Pathway Enrichment Analysis Annotated.xlsx
. After annotations are complete, execute the following chunks which compute cluster statistics.
# read data
dat = read.xlsx(here(results_dir, "Pathway Enrichment Analysis Annotated_ASP_08262024.xlsx"), sheet = sheet, check.names = TRUE) %>% as.data.table()
dat = dat[, .SD, .SDcols = c("Pathway", "Description", "K..Pathway.Measured.", "k..Overlap.Measured.", "k.K..Ratio.Measured.", "p.value", "FDR.q.value", "Genes")]
setnames(dat, c("Pathway", "Description", "K", "k", "Ratio", "p", "q", "Genes"))
# get header indices
header = dat[, which(is.na(Genes))]
# extract cluster labels
clus = dat[header, .(Pathway, Description, Ratio)]
nclus = c(header[-1], nrow(dat) + 1) - (header + 1)
# re-create cluster labels
dat = dat[-header, ][, Cluster := rep(clus$Pathway, nclus)]
# compute cluster statistics
# clusdat = dat[, .(Ratio = sum(k)/sum(K), logQ = mean(-log10(q))), by = Cluster]
clusdat = dat[, .(logQ = mean(-log10(q))), by = Cluster]
clus = merge(clus, clusdat, by.x = "Pathway", by.y = "Cluster")
Function to plot pathway data in a barplot for each enrichment analysis (i.e., for each database). Note that facet_wrap
is used for visual purposes only. Each plot created has a single facet.
# Create results table
datDB = copy(clus) %>%
.[order(logQ), ] %>%
# .[, Description := stringi::stri_rand_strings(nrow(clus), 5)] %>%
.[, Description := factor(Description, levels = Description)]
# Plot results
p_annotated = ggplot(datDB, aes(x = Description, y = logQ, fill = Ratio)) +
geom_col() +
geom_hline(yintercept = -log10(0.25), linetype = "dashed", color = "red") +
coord_flip() +
scale_fill_gradient(low="#FFD166", high="#A63446") +
labs(fill = "Gene Ratio") +
labs(x = "Pathway", y = expression(bold(-log[10](paste("FDR ", italic(q), "-value"))))) +
scale_y_continuous(expand = expansion(c(0, 0.05))) +
theme_light() +
theme(plot.title = element_text(size = 16, color = "black", face = "bold", hjust = 0.5),
# axis.title.x = element_blank(), axis.title.y = element_blank(),
axis.title = element_text(face = "bold"),
legend.title = element_text(face = "bold"))
# Save plot
ggsave(here(results_dir, "MS Reactome Enrichment Annotated.pdf"), p_annotated, width = 8, height = 4)
Load requisite packages and define directories. Note that the GitHub package mattflor/chorddiag
is used to create interactive chord diagrams using the Javascript visualization library D3. This package can be downloaded via devtools::install_github("mattflor/chorddiag")
.
This script also uses my personal utilities package brainstorm
, which can be downloaded via devtools::install_github("ayushnoori/brainstorm")
.
# fast file system operations
library(fs)
# access STRING API
library(httr)
# base visualization
library(ggplot2)
library(RColorBrewer)
# graph libraries
library(igraph)
library(ggraph)
library(graphlayouts)
# heatmap
library(ComplexHeatmap)
# chord diagrams
library(circlize)
# devtools::install_github("mattflor/chorddiag")
library(chorddiag)
# interactive network
library(visNetwork)
Note that directories are relative to the R project path.
# Define directory
network_dir = here(results_dir, "Network_analysis")
Read data, then count the number of input proteins.
# read data
dat = fread(file.path(data_dir, "89_GFP_negative_proteins.csv"), encoding = "UTF-8") %>%
setnames(colnames(.), c("UniProt", "Protein", "Gene", "Group")) %>%
.[UniProt != "GFAP"] %>%
.[order(Group)]
nrow(dat) %>% paste0("Input Proteins: ", .) %>% message
# Update group assignments
# dat[Gene == "G3bp1", Group := "RNA-binding protein"]
dat[Gene == "Rack1", Group := "Protein translation"]
dat[Gene == "G3bp1", Group := "Proteostasis"]
# show data
show_table(head(dat, 20))
STRING is a database of known and predicted protein-protein interactions which allow the creation of functional protein association networks [@szklarczyk_string_2019].
Map from gene symbols to STRING IDs using the STRING API. Proteins whose preferred IDs differ from their query symbol are later replaced before the network is constructed [@the_uniprot_consortium_uniprot_2021; @tweedie_genenamesorg_2021]. As expected, the immunoglobulins are excluded as they are not represented in the STRING database.
# root_api = "https://version-12.0.string-db.org/api"
root_api = "https://string-db.org/api"
# construct query URL
map_symbols = list(identifiers = paste(dat[, Gene], collapse="%0d"),
species = "10090", echo_query = "1", caller_identity = "SerranoPozoLab")
# complete API call to map IDs
id_request = httr::POST(url = paste0(root_api, "/tsv/get_string_ids"), body = map_symbols)
ids = httr::content(id_request, as = "text", encoding = "UTF-8") %>% fread()
# check for multiple mappings
ids[, any(duplicated(queryIndex))] %>% paste0("Multiple Mappings: ", ., "\n") %>% message()
# check for duplicate mappings
diff = ids[queryItem != preferredName]
diff[, paste(queryItem, preferredName, sep = " --> ")] %>%
c("Different Mappings: ", .) %>% paste0("\n") %>% message()
# count total proteins with mappings
nrow(ids) %>% paste0("Proteins with Mappings: ", ., "/", nrow(dat), "\n") %>% message()
# print excluded proteins
dat[!(Gene %in% ids$queryItem), paste(Gene, collapse = ", ")] %>%
paste0("Excluded Markers: ", .) %>% message()
Construct the network using another API call. Note that, as above, we use a POST
request rather than the simpler GET
to circumvent the character limit of the latter.
# construct query URL
get_network = list(identifiers = paste(ids[, stringId], collapse="%0d"),
species = "10090", echo_query = "1", caller_identity = "SerranoPozoLab")
# complete API call to retrieve network
network_request = httr::POST(url = paste0(root_api, "/tsv/network"), body = get_network)
network = httr::content(network_request, as = "text", encoding = "UTF-8") %>% fread()
# count total included proteins
network[, c(preferredName_A, preferredName_B)] %>% uniqueN() %>%
paste0("Proteins Included in Network: ", ., "/", nrow(dat), "\n") %>% message()
# print excluded proteins
network[, c(preferredName_A, preferredName_B)] %>%
c(diff$queryItem, .) %>%
{ dat[!(Gene %in% .), paste(Gene, collapse = ", ")] } %>%
paste0("Excluded Proteins: ", .) %>% message()
Use igraph
functions to create the network. Proteins whose preferred IDs differ from their query symbol are replaced before the network is constructed. Note that the complete dat
object, including the markers IGHA1, IGHG1, IGHM, and KIF21B (which have no connections), is passed to the graph_from_data_frame
call; hence, while these nodes will be included in the vertex list of the network object, they will have a centrality of 0 and will not be present when visualizing the network.
# remove unneeded columns and duplicate rows
network[, c("stringId_A", "stringId_B", "ncbiTaxonId") := NULL]
network = unique(network)
# replace with correct query symbols
replace_diff = function(net, r, pname, qname) { net[get(r) == pname, (r) := qname] }
pwalk(diff[, .(preferredName, queryItem)], ~replace_diff(network, "preferredName_A", .x, .y))
pwalk(diff[, .(preferredName, queryItem)], ~replace_diff(network, "preferredName_B", .x, .y))
setnames(network, old = 1:2, c("nodeA", "nodeB"))
# construct network
setcolorder(dat, "Gene")
net = graph_from_data_frame(d = network, vertices = dat, directed = FALSE)
# show network
show_table(head(network, 20))
Below is a key of column names which represent evidence for interaction in the STRING database.
Name | Definition |
---|---|
score | combined score |
nscore | gene neighborhood score |
fscore | gene fusion score |
pscore | phylogenetic profile score |
ascore | coexpression score |
escore | experimental score |
dscore | database score |
tscore | text mining score |
The assortativity coefficient is positive if similar vertices - based on some external property, in this case, we use functional category - tend to connect to each other, and negative otherwise.
# calculate assortativity
assort = V(net)$Group %>% as.factor() %>% as.integer() %>% assortativity_nominal(net, ., directed = FALSE)
message(paste("Assortativity Coefficient:", assort))
First, define the color palette for all subsequent plots.
# define color palette
cols = colorRampPalette(c("#B4436C", "#F2BAC9", "#F7A278", "#FCEFB4", "#C8E9A0", "#6DD3CE", "#91A6FF", "#E5C2FF", "#B49082", "#ABB8C4"))(uniqueN(dat[, Group]))
# set names and order
names(cols) = dat[, Group] %>% unique() %>% .[order(.)]
# set alternating order for chord diagram
alt = dat[, .N, by = Group] %>% .[order(Group), N] %>% order()
alt = c(rbind(alt[1:8], rev(alt[9:16])))
# set order for chord diagram
# alt = dat[, .N, by = Group] %>% .[order(Group), N] %>% order()
Plot a heatmap of the adjacency matrix. Cluster rows and columns and annotate by functional group.
# extract adjacency matrix
hm_dat = as_adj(net, attr = "score", sparse = FALSE)
diag(hm_dat) = 1
# create heatmap annotations
hm_annos = dat[, Group]
hm_cols = list(Group = cols)
# create group labels, add line breaks
hm_grp = unique(hm_annos)
# hm_grp[c(2, 3, 7, 8, 18)] = c("Blood-Brain\nBarrier", "Calcium\nHomeostasis", "Insulin\nSignaling", "Intracellular\nTrafficking", "Water/K+\nHomeostasis")
# top annotation
top_annos = HeatmapAnnotation(Group = anno_block(gp = gpar(fill = cols),
labels = hm_grp,
labels_gp = gpar(fontsize = 3, lineheight = 0.8)))
# left annotation
left_annos = rowAnnotation(Group = anno_block(gp = gpar(fill = cols),
labels = hm_grp,
labels_gp = gpar(fontsize = 3, lineheight = 0.8)))
# function to set outline of each functional group
group_outline = function(j, i, x, y, width, height, fill) {
if(i[1] == j[1]) grid.rect(gp = gpar(lwd = 2, fill = "transparent"))
}
# function to establish color scale
color_scale = function(maxcol, val) { colorRamp2(c(0, 1), c("#F3F4F7", maxcol))(val) }
# function to set color of each cell, darker color is #9DA5BE
color_cell = function(j, i, x, y, width, height, fill) {
if(hm_annos[i] == hm_annos[j]) {
grid.rect(x = x, y = y, width = width, height = height,
gp = gpar(col = NA, fill = color_scale(cols[hm_annos[i]], hm_dat[i, j])))
} else {
grid.rect(x = x, y = y, width = width, height = height,
gp = gpar(col = NA, fill = color_scale("#525C7A", hm_dat[i, j])))
}
}
# plot heatmap
hm = Heatmap(hm_dat,
col = c("#F3F4F7", "red"),
row_split = hm_annos, column_split = hm_annos,
cluster_row_slices = FALSE, cluster_column_slices = FALSE,
row_gap = unit(1, "mm"), column_gap = unit(1, "mm"),
column_title = NULL, row_title = NULL,
top_annotation = top_annos, left_annotation = left_annos,
# cell_fun = color_cell,
layer_fun = group_outline,
show_heatmap_legend = FALSE)
# heatmap_legend_param = list(
# legend_direction = "horizontal",
# legend_width = unit(6, "cm"))
# )
# save heatmap
pdf(file.path(network_dir, "Network Heatmap.pdf"), width = 15, height = 15)
print(hm)
dev.off()
Calculate the eigenvalue centrality scores for the entire graph. Then, remove isolated nodes for further visualization.
# calculate centrality
cent = eigen_centrality(net)$vector %>% data.table(Gene = names(.), Centrality = .)
# add to vertex data
dat = dat %>% merge(cent, by = "Gene") %>% .[order(-Centrality)]
vertex_attr(net, "Centrality") = cent[, Centrality]
# clean ID mapping
ids = ids[, .(queryItem, stringId, preferredName, annotation)]
setnames(ids, c("Gene", "STRING.ID", "STRING.Symbol", "STRING.Annotation"))
# merge and save marker information
dat %>% merge(ids, by = "Gene", all.x = TRUE, sort = FALSE) %>%
fwrite(file.path(network_dir, "STRING MS Annotations.csv"))
# remove isolated vertices from network
# sub = delete_vertices(net, which(igraph::degree(net) <= 2))
# sub = delete_vertices(net, which(igraph::degree(net) <= 1))
sub = net
paste0("Proteins Retained in Network: ", length(V(sub)), "/", nrow(dat)) %>% message()
# subset colors
cols = cols[names(cols) %in% unique(vertex_attr(sub, "Group"))]
# sub = copy(net)
First, create a generic network plotting function which is called multiple times throughut the script.
plot_network = function(net, layout, minmax, ..., arc = TRUE) {
edge_geom = ifelse(arc, geom_edge_arc0, geom_edge_link0)
p = ggraph(net, layout, ...) +
edge_geom(aes(width = escore), alpha = 0.4) +
scale_edge_width(range = c(0.2, 0.9)) +
geom_node_point(aes(color = Group, size = Centrality)) + scale_size(range = minmax) +
scale_color_manual(values = cols) +
theme_graph(fg_text_colour = "black", base_family = "Helvetica") +
guides(edge_width = FALSE, size = FALSE) +
labs(color = "Group")
return(p)
}
Next, create a function to plot a circos graph of the network, then plot the figure with and without labels. The labeling code is inspired by this tutorial from the R Graph Gallery. Spaces are used as an efficient trick to keep labels from overlapping with the nodes. Although the labels are truncated in the figure, they can be recovered with Inkscape (or an analogous vector editing tool).
ngrp = uniqueN(vertex_attr(sub, "Group"))
plot_circos = function(net, minmax, fname, w, h, legend = FALSE) {
p = plot_network(net, "linear", minmax, circular = TRUE, sort.by = Group)
if(!legend) { p = p + theme(legend.position = "none") } else {
p = p + # geom_node_label(aes(label = name), repel = TRUE, alpha = 0.8, segment.size = NA) +
geom_node_text(aes(label = Label, angle = Angle, hjust = Hjust)) +
theme(legend.title = element_text(face = "bold", size = 18, hjust = 0.5),
legend.text = element_text(size = 14), legend.position = "bottom",
plot.margin = unit(rep(2, 4), "cm"))
}
ggsave(file.path(network_dir, fname), p, width = w, height = h)
}
# generate angle for labels
md = sub %>%
{ data.table(Name = vertex_attr(., "name"), Angle = length(V(.))) } %>%
.[, Angle := Angle %>% { (1:.[1] - 0.5)/. } %>% { 90 - (360*.) }] %>%
.[, Hjust := ifelse(Angle < -90, 1, 0)] %>%
.[, Label := ifelse(Angle < -90, paste0(Name, " "), paste0(" ", Name))] %>%
.[, Angle := Angle %>% ifelse(. < -90, . + 180, .)]
# assign to plot
vertex_attr(sub, "Angle") = md$Angle
vertex_attr(sub, "Hjust") = md$Hjust
vertex_attr(sub, "Label") = md$Label
# create and save plot
# plot_circos(sub, c(2, 4), "Circos Plot.pdf", 8.5, 8.5)
plot_circos(sub, c(2, 15), "Circos Plot with Labels.pdf", 18, 19, TRUE)
First, transform the network to an adjacency matrix, then group by function. We also define a utility merge
function to retrieve the group for an arbitrary set of symbols - this will be useful later.
# utility function to retrieve group
get_grp = function(x, var) {
merge(x, dat[, .(Gene, Group)], by.x = var, by.y = "Gene", all.x = TRUE, sort = FALSE)
}
# melt to adjacency list
adj = as.matrix(as_adj(sub)) %>% reshape2::melt() %>% setDT()
# replace protein names with groups, then convert back to matrix
adj = adj %>% .[value == 1, ] %>% get_grp("Var1") %>% get_grp("Var2") %>%
.[, .(Group.x, Group.y)] %>% graph_from_data_frame() %>% as_adj(sparse = FALSE)
# check if name missing from adjacency matrix
for(color_name in names(cols)) {
if(!color_name %in% colnames(adj)) {
adj = cbind(adj, rep(0, times = nrow(adj)))
colnames(adj)[ncol(adj)] <- color_name
new_row = setNames(rep(0, times = ncol(adj)), colnames(adj))
adj = rbind(adj, new_row)
rownames(adj)[nrow(adj)] = color_name
}
}
# reorder alphabetically to match colors
adj = names(cols) %>% adj[., .]
The package circlize
is used to create static chord diagrams, while the GitHub package mattflor/chorddiag
is used to create interactive chord diagrams using the Javascript visualization library D3. Notice that, for the static plot, the alternating order defined previously (in alt
) is used.
# static plot
pdf(file.path(network_dir, "Chord Diagram.pdf"), 12, 12)
# circlize::chordDiagram(adj[alt, alt], transparency = 0.5, grid.col = cols[alt])
circlize::chordDiagram(adj, transparency = 0.5, grid.col = cols)
dev.off()
# interactive plot
chorddiag(adj, groupColors = as.character(cols), groupnamePadding = 30, groupnameFontsize = "10") %>%
htmlwidgets::saveWidget(., file.path(network_dir, "Interactive Chord Diagram.html"))
Create manual grouped layout to cluster by functional group (inspired by igraph::crossing()
and this StackOverflow post).
# check if two nodes are within the same cluster
el = as_edgelist(sub) %>% as.data.table() %>%
get_grp("V1") %>% get_grp("V2") %>%
# .[, Weights := ifelse(Group.x == Group.y, yes = 5, no = 25)]
.[, Weights := ifelse(Group.x == Group.y, yes = 15, no = 25)]
# # if nodes have no connections with their cluster, set weight as intermediate (to avoid clusters of n = 1)
# for(v in V(sub)$name) {
# if(el[V1 == v | V2 == v, sum(Weights == 5) <= 1]) { el[V1 == v | V2 == v, Weights := 15] }
# }
# # create manual layout
lyt = graphlayouts::layout_with_stress(sub, weights = el[, Weights]) %>%
as.data.table() %>% setnames(c("X", "Y")) %>% .[, Symbol := vertex_attr(sub, "name")]
# create manual layout
# lyt = igraph::layout_with_kk(sub, weights = el[, Weights]) %>%
# as.data.table() %>% setnames(c("X", "Y")) %>% .[, Symbol := vertex_attr(sub, "name")]
Then, create the network graph.
# create network graph
network_graph = plot_network(sub, "manual", c(1, 8), x = lyt[, X], y = lyt[, Y], arc = FALSE) +
geom_node_label(aes(label = name), repel = TRUE, alpha = 0.8,
box.padding = 0.5, segment.size = NA, label.size = 0.1, size = 8*(5/14)) +
# theme(legend.title = element_text(size = 16, face = "bold"),
# legend.text = element_text(size = 12.5), legend.position = "bottom")
theme(legend.title = element_text(face = "bold"), legend.position = "bottom")
# save network graph
ggsave(file.path(network_dir, "Network Graph.pdf"), network_graph, width = 10, height = 10)
Create an interactive network graph (with the manual layout identical to above).
# get network data
net_dat = get.data.frame(sub, "both")
# parse nodes for interactive graph
nodes = net_dat$vertices %>% as.data.table() %>%
setnames(c("name", "Centrality"), c("id", "value")) %>%
.[, label := id] %>%
.[, value := 60*(value+0.5)] %>%
.[, shape := "circle"] %>%
.[, color := factor(Group, levels = names(cols), labels = cols)] %>%
.[, .(id, label, value, shape, color)]
# parse edges for interactive graph
edges = net_dat$edges %>% as.data.table() %>%
setnames("escore", "value") %>%
.[, value := value + 0.1] %>%
.[, color := "rgba(0,0,0,0.5)"] %>%
.[, title := paste(from, to, sep = " to ")] %>%
.[, .(from, to, value, color, title)]
# create interactive network graph
int_net = visNetwork(nodes, edges) %>%
visOptions(height = 800, width = 1200,
highlightNearest = TRUE, clickToUse = TRUE,
nodesIdSelection = list(main = "Select Gene")) %>%
visInteraction(navigationButtons = TRUE, dragView = FALSE) %>%
visIgraphLayout("layout_with_kk",
coords = as.matrix(lyt[, .(X, Y)]), maxiter = 0)
visSave(int_net, file.path(network_dir, "Interactive Network Graph.html"))
If you see mistakes or want to suggest changes, please create an issue on the source repository.