Partition by State

This R script partitions the data into training, test, and validation sets (approximately 60%/20%/20%) using stratified random sampling by State (i.e., homeostatic vs. intermediate vs. reactive) and by condition (i.e., CTRL vs. AD).

true

Dependencies

Load requisite packages and define directories. Note that this script uses my personal utilities package `brainstorm`, which can be installed via `devtools::install_github("ayushnoori/brainstorm")`.

# data manipulation
library(data.table)
library(purrr)
library(magrittr)

# fast file system operations
library(fs)

# partition data
library(caret)

# utility functions
library(brainstorm)

Note that directories are relative to the R project path.

# set directories (all relative to the R project path)
ddir = file.path("Data", "3 - ROIs")
pdir = file.path("Data", "5 - State Partition")
dir1 = file.path("Results", "CNN", "1.2 - State Partition")
dir4 = file.path("Results", "4 - Spectral Clustering")

# create file structure: one output folder for every combination of
# cell type x partition x state, i.e., <pdir>/<celltype>/<grp>/<state>
celltypes = c("Astrocyte", "Microglia", "Vessel") %>% purrr::set_names()
grp = c("Train", "Test", "Validation") %>% purrr::set_names()
pheno = c("CTRL", "AD") %>% purrr::set_names()
state = c("Homeostatic", "Intermediate", "Reactive") %>% purrr::set_names()

# stringsAsFactors = FALSE keeps every path component a character string, so
# file.path() is never handed factor columns (or their integer codes)
dirs = pmap_chr(
  expand.grid(pdir, celltypes, grp, state, stringsAsFactors = FALSE),
  file.path
)

# remove prior directories/files if they exist, then recreate each folder
# empty so that stale TIFFs from a previous run cannot leak into this one
check_dir = function(fname) {
  if (fs::dir_exists(fname)) {
    fs::dir_delete(fname)
  }
  fs::dir_create(fname)
}
walk(dirs, check_dir)

Retrieve ROI Paths

Write function to retrieve ROI paths.

# List the TIFF files of every cell type found under a single crop directory.
#
# Arguments:
#   fname  path to a crop directory containing "<celltype> ROIs" subfolders
#
# Returns a list with one element per entry of `celltypes` (named like
# `celltypes`), each a character vector of full paths to the ".tif" files in
# the matching "<celltype> ROIs" subdirectory.
retrieve_paths = function(fname) {

  map(celltypes, function(ct) {
    # TIFFs for this cell type live in "<fname>/<celltype> ROIs"
    roi_dir = file.path(fname, paste(ct, "ROIs"))
    list.files(roi_dir, pattern = "\\.tif$", full.names = TRUE)
  })

}

Then, map the function over the crop list.

# get crop list: every entry inside the per-condition folders of `ddir`
crops = list.files(file.path(ddir, pheno), full.names = TRUE)

# get TIFF file paths: one list per crop, each split by cell type
tiffs = crops %>% map(retrieve_paths)

# aggregate TIFF file paths by cell type: pull the element of each crop's list
# that belongs to the cell type and flatten into one unnamed character vector
tiffs = celltypes %>%
  map(function(ct) unlist(map(tiffs, ct), use.names = FALSE))

Partition ROIs

Define function to partition ROIs into training, test, and validation sets.

# Partition the ROIs of one cell type into training, test, and validation sets
# and copy each TIFF file to "<pdir>/<Group>/<Partition>/<State>/<Name>".
#
# Arguments:
#   flist  character vector of TIFF file paths for a single cell type; paths
#          are expected to look like
#          "<ddir>/<condition>/<crop>/<celltype> ROIs/<name>.tif"
#   lab    cell type label; stored in the Group column and used in the
#          output path
#   sc     data.table with (at least) the columns ID and State
#
# Returns the data.table of partitioned ROIs with the columns ID, Path, Name,
# Group, Condition, Sample, Batch, State, Partition, and Output.
#
# The split is stratified by State and Condition: 60% of ROIs go to training,
# and the remainder is split in half between test and validation. Sampling is
# random, so call set.seed() beforehand if a reproducible split is required.
# Relies on the global `pdir` for the output root.
partition_rois = function(flist, lab, sc) {

  # construct data table
  dat = data.table(Path = flist)
  message("\n", toupper(lab), " ANALYSIS:")
  message("Total TIFF Files: ", nrow(dat))
  message("Total ROI Measurements: ", nrow(sc))

  # parse metadata from file path; walking up the directory tree (rather than
  # indexing fixed positions of the split path) keeps this independent of how
  # many components the data directory itself has
  dat[, Name := basename(Path)]
  dat[, Group := lab]
  dat[, Condition := basename(dirname(dirname(dirname(Path))))]
  dat[, Sample := map_chr(strsplit(basename(dirname(dirname(Path))), "_"), 1)]
  dat[, Batch := ifelse(Sample %in% c("1190", "1301", "2148", "2157",
                                      "2191", "2207"), 1, 2)]

  # strip the condition prefix and the (literal, trailing) file extension
  dat[, ID := gsub("(AD_|CTRL_|\\.tif$)", "", Name)]

  # full outer join so that mismatches between files and measurements surface
  dat = merge(dat, sc[, .(ID, State)], by = "ID", all = TRUE)

  # rows missing either a file or a State can neither be stratified nor
  # copied, so report them once and drop them instead of failing mid-copy
  no_state = dat[!is.na(Path) & is.na(State), .N]
  no_tiff = dat[is.na(Path), .N]
  if (no_state > 0 || no_tiff > 0) {
    warning(lab, ": dropping ", no_state, " TIFF file(s) without a State and ",
            no_tiff, " ROI measurement(s) without a TIFF file.",
            call. = FALSE)
    dat = dat[!is.na(Path) & !is.na(State)]
  }

  # partition into training set (60%), stratified by State and Condition
  strata = dat[, paste(State, Condition, sep = "_")]
  train_idx = as.vector(createDataPartition(strata, p = 0.6, list = FALSE))
  train_lab = dat[train_idx, Name]

  # split the remaining ROIs in half: test set here, validation set otherwise
  rest = dat[!Name %in% train_lab]
  rest_strata = rest[, paste(State, Condition, sep = "_")]
  test_idx = as.vector(createDataPartition(rest_strata, p = 0.5, list = FALSE))
  test_lab = rest[test_idx, Name]

  # create partition variable
  dat[, Partition := "Validation"]
  dat[Name %in% train_lab, Partition := "Train"]
  dat[Name %in% test_lab, Partition := "Test"]

  # construct output path
  dat[, Output := file.path(pdir, Group, Partition, as.character(State), Name)]

  # copy TIFF files to appropriate output folder (file_copy is vectorized)
  fs::file_copy(dat$Path, dat$Output)

  # print results
  cat(paste("\n", lab, "ROIs:\n"))
  walk(dat[, .(Condition, Partition, Sample)], ~print(summary(factor(.x))))

  # return data table
  return(dat)

}

Map the function over the TIFF file paths of each cell type.

# read spectral clustering data (a list indexed by cell type name, as used
# by all_sc[[.y]] below)
all_sc = readRDS(file.path(dir4, "Z-Score Data.rds"))

# partition ROIs: .x is the vector of TIFF paths, .y the cell type name
all = imap(tiffs, ~partition_rois(.x, .y, all_sc[[.y]]))

# save partition result; make sure the results folder exists first so the
# script does not fail at the very last step after all files were copied
fs::dir_create(dir1)
saveRDS(all, file.path(dir1, "ROI Partition by State.rds"))

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.