Appendix
10.1 Calculating the ambient RNA profile
By calculating the ambient RNA profile, we can perform two important analyses to improve data quality: (1) removing empty droplets and (2) normalizing for background RNA contamination. Chapter 3 of the tutorial outlines both processes. However, the first step is to calculate the background RNA profile by quantifying gene expression for a list of barcodes known to represent background droplets rather than true cells.
When running FLAMES, one of the outputs is the empty_bc_list.csv file, which contains 2,000 barcodes identified as background, not associated with cells. Users can manually add additional background barcodes if needed (for advanced users). For a detailed understanding of how this list is determined, refer to the BLAZE publication (You et al., 2023).
To incorporate the background barcode list, users should run FLAMES as they did previously, but include the background barcode list as the barcode_file parameter. There is no need to identify or quantify isoforms so users can set the following parameters to false in the config file
"do_isoform_identification": false,
"bambu_isoform_identification": false,
"do_read_realignment": false,
"do_transcript_quantification": false
Code
# Run FLAMES using background barcode list genrated in the first run.
<- sc_long_pipeline(fastq=fastq, outdir=output, annot=GTF, genome_bookdfa=genome,
sce #minimap=minimap_dir, k8=k8,
config_file=config_file,
expect_cell_number=1000, barcodes_file='path/to/emtpy_bc_list.csv')
10.2 Multisample QC function
Code
# Perform QC for each sample
<- function(count.matrix=C1_STC, min.features = 5000 ,
perform_qc_filtering max.features = 10000, max.counts = 100000,
min.counts = 3000, npc = 15, cluster_res = 0.9,
fig_name = '1', project = "2",
MT = 10, doublet_rate = 0.039) {
# Function to calculate min.features and max.features if not provided
<- function(nFeature_RNA) {
calculate_feature_range <- round(mean(nFeature_RNA) - (1.5 * sd(nFeature_RNA)))
min_feature <- round(mean(nFeature_RNA) + (1.5 * sd(nFeature_RNA)))
max_feature return(list(min_feature = min_feature, max_feature = max_feature))
}
# If min.features and max.features not provided, calculate them
if (is.null(min.features) || is.null(max.features)) {
<- CreateSeuratObject(counts = count.matrix, project = project)
seurat_obj_org <- calculate_feature_range(seurat_obj_org$nFeature_RNA)
feature_range <- feature_range$min_feature
min.features <- feature_range$max_feature
max.features
}
<- list()
rst_figures <- data.frame()
rst_table
##### remove features expressed in less than 1% of cells #####
# Calculate the percentage of cells expressing each gene
#gene_percent_expression <- base::rowMeans(count.matrix > 0) * 100
# Select genes expressed in at least 1% of cells
#genes_filter <- names(gene_percent_expression[gene_percent_expression > 1])
# Filter counts
#counts_sub <- count.matrix[genes_filter, ]
# Record the number of features removed
#removed_features <- dim(count.matrix)[1] - length(genes_filter)
#######
# Initialize Seurat object
<- CreateSeuratObject(counts = count.matrix,
seurat_object project = project)
<- FeatureScatter(seurat_object,
plot_scatter1 feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +
geom_smooth(method = "lm") +
NoLegend() +
labs(title = "Association between reads and \nunique genes per cell BEFORE filtering")
plot(plot_scatter1)
# Add mitochondrial percentage
"joined"]] <- JoinLayers(seurat_object[["RNA"]])
seurat_object[["percent.mt"]] <- PercentageFeatureSet(seurat_object, pattern = "^MT-")
seurat_object[[
# Plot violin plots
<- VlnPlot(seurat_object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"))
p1 + plot_annotation(title = "QC plots (gene level) BEFORE Filtering")
p1
plot(p1)
# Remove low quality cells
<- subset(seurat_object,
filt_seurat_object subset = nFeature_RNA > min.features & nFeature_RNA < max.features & percent.mt < MT & nCount_RNA < max.counts & nCount_RNA > min.counts)
# Plot quality metrics after filtering
<- VlnPlot(filt_seurat_object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"))
p2 + plot_annotation(title = "QC metrics gene level AFTER Filtering")
p2
plot(p2)
#######
# Normalize data
<- NormalizeData(filt_seurat_object,
filt_seurat_object normalization.method = "LogNormalize",
scale.factor = 10000)
# Identify highly variable features
<- FindVariableFeatures(filt_seurat_object,
filt_seurat_object selection.method = "vst",
nfeatures = 2000)
# Apply linear transformation
<- rownames(filt_seurat_object)
all_genes <- ScaleData(filt_seurat_object, features = all_genes)
filt_seurat_object
# Perform PCA
<- RunPCA(filt_seurat_object,
filt_seurat_object features = VariableFeatures(object = filt_seurat_object))
# Visualize PCA
<- append(rst_figures, ElbowPlot(filt_seurat_object))
rst_figures
# Cluster cells
<- FindNeighbors(filt_seurat_object, dims = 1:npc)
filt_seurat_object <- FindClusters(filt_seurat_object, resolution = cluster_res)
filt_seurat_object
# Perform UMAP
<- RunUMAP(filt_seurat_object, dims = 1:npc)
filt_seurat_object
### Filter out doublets (remember to modify doublet rate if samples have variable target cells)
## pK Identification (no ground-truth) ---------------------------------------------------------------------------------------
<- paramSweep(filt_seurat_object, PCs = 1:20, sct = FALSE)
sweep.res.list_pbmc <- summarizeSweep(sweep.res.list_pbmc, GT = FALSE)
sweep.stats_pbmc <- find.pK(sweep.stats_pbmc)
bcmvn_pbmc
<- bcmvn_pbmc %>% filter(BCmetric == max(BCmetric)) %>% dplyr::select(pK)
pK <- as.numeric(as.character(pK[[1]]))
pK
## Homotypic Doublet Proportion Estimate -------------------------------------------------------------------------------------
<- filt_seurat_object@meta.data$seurat_clusters
annotations <- modelHomotypic(annotations)
homotypic.prop <- round(doublet_rate * nrow(filt_seurat_object@meta.data))
nExp_poi <- round(nExp_poi * (1 - homotypic.prop))
nExp_poi.adj
# Run doubletFinder
<- doubletFinder(filt_seurat_object,
filt_seurat_object PCs = 1:20,
pN = 0.25,
pK = pK,
nExp = nExp_poi.adj,
reuse.pANN = FALSE,
sct = FALSE)
colnames(filt_seurat_object@meta.data) <- sub("DF.classifications_.*$", "DF.classifications", colnames(filt_seurat_object@meta.data))
# Summary doublets
<- filt_seurat_object@meta.data %>%
statsDoublets group_by(DF.classifications) %>%
::summarize(Median_nCount_RNA = median(nCount_RNA),
dplyrMedian_nFeature_RNA = median(nFeature_RNA), Count = n())
# Visualize doublets
<- DimPlot(filt_seurat_object,
doublets reduction = 'umap',
group.by = "DF.classifications")
### i want to save the seurat object with doublets listed
<- filt_seurat_object
filt_seurat_object_doublets
<- subset(filt_seurat_object, subset = DF.classifications == 'Singlet')
filt_seurat_object
# figures
<- list(
ggplot_list ElbowPlot(filt_seurat_object) +
labs(title = 'SD explained by each PC') + theme(text = element_text(size = 10)),
FeatureScatter(filt_seurat_object, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +
geom_smooth(method = "lm") + NoLegend() +
labs(title = "Association between reads and \nunique genes per cell AFTER filtering"),
DimPlot(filt_seurat_object, reduction = "umap") +
labs(color = "Cluster \n(from PCA)", title = '') +
theme(text = element_text(size = 10)),
FeaturePlot(filt_seurat_object, reduction = "umap", features = 'nCount_RNA') +
labs(color = "UMI count", title = '') +
theme(text = element_text(size = 10)),
FeaturePlot(filt_seurat_object, reduction = "umap", features = 'nFeature_RNA') +
labs(color = str_wrap("Feature count (gene)", 15), title = '') +
theme(text = element_text(size = 10)),
p2
)
<- plot_grid(plotlist = ggplot_list, ncol = 3)
combined_plots
plot(combined_plots)
plot(DimPlot(filt_seurat_object_doublets, reduction = 'umap',
group.by = "DF.classifications"))
<- tableGrob(statsDoublets)
tbl_sts1 grid.newpage()
grid.draw(tbl_sts1)
# summary stats
<- rbind("Sample ID" = project,
stats_sumary "Cells_before_filter" = dim(seurat_object)[2],
"Cells_after_filter" = dim(filt_seurat_object)[2],
"Median Feature per Cell before filter" = median(seurat_object$nFeature_RNA),
"Median Reads per Gene before filter" = median(seurat_object$nCount_RNA),
"Median Feature per Cell" = median(filt_seurat_object$nFeature_RNA),
"Median Reads per Gene" = median(filt_seurat_object$nCount_RNA),
"Max Features" = max.features,
"Min Features" = min.features,
"Min Counts" = min.counts,
"Max Counts" = max.counts,
"MT Percentage" = MT,
"NPCs" = npc,
"Median Percent MT before Filter" = median(seurat_object@meta.data[["percent.mt"]]),
"Median Percent MT after Filter" = median(filt_seurat_object@meta.data[["percent.mt"]])
)
<- tableGrob(stats_sumary)
tbl_sts2 grid.newpage()
grid.draw(tbl_sts2)
list(filt_seurat_object,
statsDoublets,
stats_sumary,
filt_seurat_object_doublets) }
10.3 Convert Oarfish files to count matrix
This is a function to take all oarfish files from FLAMES multisample output folder and convert the files into a gene count matrix that contains gene symbol ids instead of ENSGIDs. To use this fucntion ensure that the isoform_gene_dict.csv
file has been genrated as decibed in ??
Code
### readSeurat### read in oarfish count files and add them to seurat objects ###
<- function(sample_name, resource_table_path, output_dir, input_dir) {
process_oarfish_files_to_counts_matrix
# Load required libraries
library(Matrix)
library(dplyr)
# Read in the resource table (transcript_id, gene_id, gene_symbol)
<- fread(resource_table_path)
combined_data
# Define the file paths based on the sample name
# Construct file paths
<- file.path(input_dir, paste0(sample_name, ".count.mtx"))
count_matrix_path <- file.path(input_dir, paste0(sample_name, ".barcodes.txt"))
barcodes_path <- file.path(input_dir, paste0(sample_name, ".features.txt"))
features_path
# Read the data
<- readMM(count_matrix_path)
counts <- readLines(barcodes_path)
barcodes <- read.delim(features_path, header = FALSE)
features
# Transpose the matrix if needed (for Seurat compatibility)
<- t(counts)
counts
# Set row and column names
rownames(counts) <- features$V1
colnames(counts) <- barcodes
# Convert to a data frame
<- as.data.frame(counts)
counts_df
# Add transcript_id as the first column
$transcript_id <- rownames(counts_df)
counts_df<- counts_df[, c(ncol(counts_df), 1:(ncol(counts_df)-1))]
counts_df
# Merge with the resource table to add gene symbols
<- counts_df %>%
df_genesymbol left_join(combined_data, by = "transcript_id")
# Remove the gene_id column and reorder the columns
$gene_id <- NULL
df_genesymbol<- df_genesymbol[, c(ncol(df_genesymbol), 1:(ncol(df_genesymbol)-1))]
df_genesymbol
# Update row names to include gene symbol instead of transcript_id
rownames(df_genesymbol) <- paste0(df_genesymbol$transcript_id, "_", df_genesymbol$gene_symbol)
$transcript_id <- NULL
df_genesymbol$gene_symbol <- NULL
df_genesymbol
# Write the output to a CSV file
<- file.path(output_dir, paste0("gene_symbol_oarfish_", sample_name, "_counts.csv"))
output_path fwrite(df_genesymbol, output_path, row.names = TRUE)
cat("Processed sample:", sample_name, "\nOutput saved to:", output_path, "\n")
}
# Example usage:
# List of sample names
<- c("C1_STC")
sample_names
#create a resource
# Call the helper function defined in code block above to create a dictionary containing corresponding gene information for each isoform
# This may take a few minutes
# The FLAMES ref can be found in your selected output folder after running the Flames pipeline.
<- "./data/muti_sample/isoform_annotated.gtf" #ensure file is unzipped
FLAMES_gtf_file <- "./data/gencode.v47.annotation.gtf" # ensure file is unzipped
reference_gtf_file <- "multi_isoform_gene_dict.csv"
output_file
<- make_isoform_gene_symbol_dict(FLAMES_gtf_file,
isoform_gene_dict
reference_gtf_file,
output_file)# run this in the output FLAMES dir
# Loop through each sample and process the count files
for (sample in sample_names) {
# Use tryCatch to handle errors
tryCatch({
# Call the function to process the sample
process_oarfish_files_to_counts_matrix(
sample_name = sample,
resource_table_path = "./output_files/ref_files/multi_isoform_gene_dict.csv",
output_dir = "./output_files/mutli_sample/oarfish_counts",
input_dir = "./output_files/mutli_sample/oarfish_counts"
)error = function(e) {
}, # Print the error message and exit the loop
cat("Error processing sample:", sample, "\nError message:", e$message, "\nExiting loop.\n")
stop(e)
}) }