snRNA-seq Quality Control

library(Seurat)
library(tidyverse)
library(hdf5r)
library(patchwork)
library(presto)
library(glmGamPoi)

tissue <- c("Midbrain")
sample_names <- c("BB48", "BB49INJ", "BB50", "BB50INJ")

# Empty list to populate seurat object for each sample
list_seurat <- list()

for (sample in sample_names) {
  # Path to data directory
  data_dir <- file.path("data/ssrnaseq_data", tissue, sample)

  # Create a Seurat object for each sample
  seurat_data <- Read10X(data.dir = data_dir)
  seurat_obj <- CreateSeuratObject(
    counts = seurat_data,
    min.features = 1,
    project = paste0(tissue, "_", sample)
  )

  # Save seurat object to list
  list_seurat[[sample]] <- seurat_obj
}

merged_seurat <- merge(
  x = list_seurat[[1]],
  y = list_seurat[2:length(list_seurat)],
  add.cell.id = names(list_seurat)
)

merged_seurat <- JoinLayers(merged_seurat)
merged_seurat

An object of class Seurat 
30521 features across 5234 samples within 1 assay 
Active assay: RNA (30521 features, 0 variable features)
 1 layer present: counts

# Check that the merged object has the appropriate sample-specific prefixes
head(merged_seurat@meta.data)

                           orig.ident nCount_RNA nFeature_RNA
BB48_AAACCCTGTCTGGCAC-1 Midbrain_BB48       9358         3816
BB48_AAACCCTGTTCGCTAG-1 Midbrain_BB48        502          386
BB48_AAACGATGTCTTACCC-1 Midbrain_BB48       8015         3381
BB48_AAACGATGTGGCCAGA-1 Midbrain_BB48        517          433
BB48_AAACGATGTTATAGGG-1 Midbrain_BB48       7065         3208
BB48_AAACTCCGTAAGCAAT-1 Midbrain_BB48        626          485

tail(merged_seurat@meta.data)

                                 orig.ident nCount_RNA nFeature_RNA
BB50INJ_TGTACTGAGGGCCATT-1 Midbrain_BB50INJ      11366         3644
BB50INJ_TGTACTGAGGGTATGG-1 Midbrain_BB50INJ       4424         2230
BB50INJ_TGTAGCCAGTGAACCC-1 Midbrain_BB50INJ       4303         2224
BB50INJ_TGTAGTTAGCATGTAT-1 Midbrain_BB50INJ       2817         1550
BB50INJ_TGTAGTTAGTGGCCTG-1 Midbrain_BB50INJ        537          461
BB50INJ_TGTCCAAAGTGACCCG-1 Midbrain_BB50INJ        833          669

merged_seurat$log10GenesPerUMI <- log10(merged_seurat$nFeature_RNA) / log10(merged_seurat$nCount_RNA)
# Create metadata dataframe
metadata <- merged_seurat@meta.data
# Add cell IDs to metadata
metadata$cells <- rownames(metadata)
# Create sample column
metadata$sample <- metadata$orig.ident
# Rename columns
metadata <- metadata %>%
        dplyr::rename(nUMI = nCount_RNA,
                      nGene = nFeature_RNA)
# Add metadata back to Seurat object
merged_seurat@meta.data <- metadata
                           
# Create .RData object to load at any time
save(merged_seurat, file=file.path("data/ssrnaseq_data/",tissue,paste0(tissue,"_merged_filtered_seurat.RData")))

# Visualize the number of cell counts per sample
metadata %>% 
    ggplot(aes(x=sample, fill=sample)) + 
    geom_bar() +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("NCells")

# Visualize the number UMIs/transcripts per cell
metadata %>% 
    ggplot(aes(color=sample, x=nUMI, fill= sample)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() + 
    theme_classic() +
    ylab("Cell density") +
    geom_vline(xintercept = 100)

# Visualize the distribution of genes detected per cell via histogram
metadata %>% 
    ggplot(aes(color=sample, x=nGene, fill= sample)) + 
    geom_density(alpha = 0.2) + 
    theme_classic() +
    scale_x_log10() + 
    geom_vline(xintercept = 100)

# Visualize the overall complexity of the gene expression by visualizing 
# the genes detected per UMI (novelty score)
metadata %>%
    ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
    geom_density(alpha = 0.2) +
    theme_classic() +
    geom_vline(xintercept = 0.8)

# Visualize the correlation between genes detected and number of UMIs and
# determine whether strong presence of cells with low numbers of genes/UMIs
metadata %>%
    ggplot(aes(x=nUMI, y=nGene, color=log10GenesPerUMI)) +
    geom_point() +
    scale_colour_gradient(low = "gray90", high = "black") +
    stat_smooth(method=lm) +
    scale_x_log10() +
    scale_y_log10() +
    theme_classic() +
    geom_vline(xintercept = 500) +
    geom_hline(yintercept = 250) +
    facet_wrap(~sample)

# Barcode rank plot (knee plot): for each sample, rank barcodes by UMI count
# (descending) and plot rank vs. UMI count on a log-log scale.
# The "knee" separates real cells (high UMI) from empty droplets (low UMI).
metadata %>%
    group_by(sample) %>%
    arrange(desc(nUMI), .by_group = TRUE) %>%
    mutate(rank = row_number()) %>%
    ungroup() %>%
    ggplot(aes(x = rank, y = nUMI, color = sample)) +
    geom_line(linewidth = 0.8) +
    scale_x_log10(labels = scales::label_comma()) +
    scale_y_log10(labels = scales::label_comma()) +
    annotation_logticks(sides = "bl", color = "grey50") +
    theme_bw() +
    theme(
        panel.grid.major = element_line(color = "grey88"),
        panel.grid.minor = element_line(color = "grey93"),
        plot.title       = element_text(hjust = 0.5, face = "bold")
    ) +
    labs(
        title = "Barcode Rank Plot",
        x     = "Barcodes",
        y     = "UMI counts",
        color = "Sample"
    ) +
    guides(color = guide_legend(override.aes = list(linewidth = 2)))

--- title: 'snRNA-seq Quality Control' teaching: 10 exercises: 2 optional: false --- ```{r load-seurat, message=FALSE, warning=FALSE} library(Seurat) library(tidyverse) library(hdf5r) library(patchwork) library(presto) library(glmGamPoi) ``` ```{r load-data, message=FALSE, warning=FALSE} tissue <- c("Midbrain") sample_names <- c("BB48", "BB49INJ", "BB50", "BB50INJ") # Empty list to populate seurat object for each sample list_seurat <- list() for (sample in sample_names) { # Path to data directory data_dir <- file.path("data/ssrnaseq_data", tissue, sample) # Create a Seurat object for each sample seurat_data <- Read10X(data.dir = data_dir) seurat_obj <- CreateSeuratObject( counts = seurat_data, min.features = 1, project = paste0(tissue, "_", sample) ) # Save seurat object to list list_seurat[[sample]] <- seurat_obj } ``` ```{r inspect-data, message=FALSE, warning=FALSE} merged_seurat <- merge( x = list_seurat[[1]], y = list_seurat[2:length(list_seurat)], add.cell.id = names(list_seurat) ) merged_seurat <- JoinLayers(merged_seurat) merged_seurat ``` ```{r} # Check that the merged object has the appropriate sample-specific prefixes head(merged_seurat@meta.data) tail(merged_seurat@meta.data) ``` ```{r} merged_seurat$log10GenesPerUMI <- log10(merged_seurat$nFeature_RNA) / log10(merged_seurat$nCount_RNA) # Create metadata dataframe metadata <- merged_seurat@meta.data # Add cell IDs to metadata metadata$cells <- rownames(metadata) # Create sample column metadata$sample <- metadata$orig.ident # Rename columns metadata <- metadata %>% dplyr::rename( nUMI = nCount_RNA, nGene = nFeature_RNA ) # Add metadata back to Seurat object merged_seurat@meta.data <- metadata # Create .RData object to load at any time save(merged_seurat, file = file.path("data/ssrnaseq_data/", tissue, paste0(tissue, "_merged_filtered_seurat.RData"))) ``` ```{r} # Visualize the number of cell counts per sample metadata %>% ggplot(aes(x = sample, fill = sample)) + geom_bar() + theme_classic() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + theme(plot.title = element_text(hjust = 0.5, face = "bold")) + ggtitle("NCells") ``` ```{r} # Visualize the number UMIs/transcripts per cell metadata %>% ggplot(aes(color = sample, x = nUMI, fill = sample)) + geom_density(alpha = 0.2) + scale_x_log10() + theme_classic() + ylab("Cell density") + geom_vline(xintercept = 100) ``` ```{r} # Visualize the distribution of genes detected per cell via histogram metadata %>% ggplot(aes(color = sample, x = nGene, fill = sample)) + geom_density(alpha = 0.2) + theme_classic() + scale_x_log10() + geom_vline(xintercept = 100) ``` ```{r} # Visualize the overall complexity of the gene expression by visualizing # the genes detected per UMI (novelty score) metadata %>% ggplot(aes(x = log10GenesPerUMI, color = sample, fill = sample)) + geom_density(alpha = 0.2) + theme_classic() + geom_vline(xintercept = 0.8) ``` ```{r,warning=FALSE} # Visualize the correlation between genes detected and number of UMIs and # determine whether strong presence of cells with low numbers of genes/UMIs metadata %>% ggplot(aes(x = nUMI, y = nGene, color = log10GenesPerUMI)) + geom_point() + scale_colour_gradient(low = "gray90", high = "black") + stat_smooth(method = lm) + scale_x_log10() + scale_y_log10() + theme_classic() + geom_vline(xintercept = 500) + geom_hline(yintercept = 250) + facet_wrap(~sample) ``` ```{r barcode-rank-plot, warning=FALSE} # Barcode rank plot (knee plot): for each sample, rank barcodes by UMI count # (descending) and plot rank vs. UMI count on a log-log scale. # The "knee" separates real cells (high UMI) from empty droplets (low UMI). metadata %>% group_by(sample) %>% arrange(desc(nUMI), .by_group = TRUE) %>% mutate(rank = row_number()) %>% ungroup() %>% ggplot(aes(x = rank, y = nUMI, color = sample)) + geom_line(linewidth = 0.8) + scale_x_log10(labels = scales::label_comma()) + scale_y_log10(labels = scales::label_comma()) + annotation_logticks(sides = "bl", color = "grey50") + theme_bw() + theme( panel.grid.major = element_line(color = "grey88"), panel.grid.minor = element_line(color = "grey93"), plot.title = element_text(hjust = 0.5, face = "bold") ) + labs( title = "Barcode Rank Plot", x = "Barcodes", y = "UMI counts", color = "Sample" ) + guides(color = guide_legend(override.aes = list(linewidth = 2))) ```