Chapter 2 Setup
2.1 load in required packages
Code
#install packages if required. Note some packages require installation via bioconductor. See installation instruction for each package to ensure installation is successful.
library(rtracklayer)
library(Seurat)
library(DropletUtils)
library(gridExtra)
library(data.table)
library(BiocParallel)
library(celda)
library(SingleCellExperiment)
library(DoubletFinder)
library(stringr)
library(cowplot)
library(grid)
library(patchwork)
library(tidyverse)
library(ORFik)
library(GenomicFeatures)
library(Gviz)
library(BSgenome.Hsapiens.UCSC.hg38)
# Set working directory and create folders for output files
setwd(".") # Set this to correct location
dir.create("./output_files/ref_files", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/counts", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/seu_objects", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/empty_drops", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/decontx", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/QC", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/DE", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/mutli_sample", recursive = TRUE, showWarnings = FALSE)
2.2 Creating resource files.
To start, we’ll create a few essential files that will be used throughout the analysis. The first step is to generate a CSV file containing three key columns: ENSTID, ENSGID, and geneSymbol. This file will be used as a dictionary to rename entries in both the isoform and gene count matrices, replacing ENSGID with the corresponding gene symbol. By adopting this naming convention for ENSTID, we can easily identify the gene origin of each isoform, streamlining the interpretation and analysis of gene and isoform-level data.
First lets define a helper function for this step:
Code
# Function to make csv naming resource
<- function(FLAMES_gtf,
make_isoform_gene_symbol_dict
reference_gtf,
output_file) {# Import the first GTF file (transcripts GTF)
<- import(FLAMES_gtf)
gtf1 <- as.data.frame(gtf1)
gtf1_df
# Select relevant columns from the first GTF
<- gtf1_df[, c("transcript_id", "gene_id")]
selected_columns1 <- unique(selected_columns1)
unique_selected_cols
# Import the second GTF file (reference GTF with gene symbols)
<- import(reference_gtf)
gtf2 <- as.data.frame(gtf2)
gtf2_df
# Select relevant columns from the second GTF
<- gtf2_df[, c("gene_name", "gene_id")]
selected_columns2 <- unique(selected_columns2)
unique_gene_symbol
# Merge the two data frames on 'gene_id'
<- merge(unique_selected_cols,
combined_data
unique_gene_symbol, by = "gene_id",
all.x = TRUE)
# If 'gene_name' is missing, replace it with 'gene_id'
$gene_symbol <- ifelse(is.na(combined_data$gene_name),
combined_data$gene_id,
combined_data$gene_name)
combined_data
# Select relevant columns
<- combined_data[, c("transcript_id", "gene_id", "gene_symbol")]
final_combined_data
# Write to a CSV file
write.csv(final_combined_data, file = file.path("output_files/ref_files", output_file), row.names = FALSE)
return(final_combined_data)
}
Run this chunk to create the dictionary containing ENSTID, ENSGID, and geneSymbol information:
Code
# The FLAMES ref can be found in your selected output folder after running the Flames pipeline.
<- "./data/FLAMES_out/isoform_annotated.gtf" #ensure file is unzipped
FLAMES_gtf_file <- "data/gencode.v47.annotation.gtf" # ensure file is unzipped
reference_gtf_file <- "isoform_gene_dict.csv"
output_file
# Call the helper function defined in code block above to create a dictionary containing corresponding gene information for each isoform
# This may take a few minutes
<- make_isoform_gene_symbol_dict(FLAMES_gtf_file,
isoform_gene_dict
reference_gtf_file, output_file)
2.3 Convert count matrices from Gene ID to gene Symbol
With the reference dictionary in place, we can now rename both our count matrix and background count matrix by converting ENSGIDs to geneSymbols. This conversion not only simplifies the interpretation of gene expression in single cells but is also necessary for some downstream tools that require gene symbols instead of ENSGIDs, such as automated cell annotation tools.
Like before, lets define a generic helper function first to do this:
Code
<- function(gene_count_matrix_path,
convert_ENSGID_to_geneSymbol id_symbol_df = isoform_gene_dict,
output_file,return_df = FALSE) {
# Load the reference dictionary we made earlier - select gene-level cols
<- as_tibble(id_symbol_df) %>%
id_symbol_df ::select(gene_id, gene_symbol)
dplyr
# Load the data object with ENSGID row names
<- fread(gene_count_matrix_path, header = TRUE)
gene_count_matrix colnames(gene_count_matrix)[1] <- "gene_id"
# Replace ENSGIDs with gene symbols in original flames gene-level count matrix
<- gene_count_matrix %>%
formatted_gene_count_matrix merge(id_symbol_df, by.x = 'gene_id', by.y = 'gene_id') %>% # Add gene symbol information
distinct(gene_symbol, .keep_all = TRUE) %>% # Remove duplicates based on gene symbol
::select(-gene_id) %>% # Remove the ENSGID column
dplyrcolumn_to_rownames(var = "gene_symbol") # use the gene symbols we added as rownames
# Write out the processed data frame
fwrite(formatted_gene_count_matrix,
output_file, row.names = TRUE)
# Return the processed count matrix for further use if needed
if(return_df){
return(formatted_gene_count_matrix)
} }
Run the chunk below to format gene-level count matrices for background and FLAMES data using the helper function from above:
Code
# convert Gene_id to gene symbol for gene counts
convert_ENSGID_to_geneSymbol(
gene_count_matrix_path = "./data/FLAMES_out/gene_count.csv",
output_file = "./output_files/counts/geneSymbol_gene_count.csv"
)
# convert Gene_id to gene symbol for background counts
convert_ENSGID_to_geneSymbol(
gene_count_matrix_path = "./data/background/gene_count.csv",
output_file = "./output_files/counts/background_geneSymbol_gene_count.csv"
)
Now we have the files we need to begin cleaning our data and removing unwanted noise.