Chapter 2 Setup


2.1 load in required packages

Code
#install packages if required. Note some packages require installation via bioconductor. See installation instruction for each package to ensure installation is successful. 
library(rtracklayer)
library(Seurat)
library(DropletUtils)
library(gridExtra)
library(data.table)
library(BiocParallel)
library(celda)
library(SingleCellExperiment)
library(DoubletFinder)
library(stringr)
library(cowplot)
library(grid)
library(patchwork)
library(tidyverse)
library(ORFik)
library(GenomicFeatures)
library(Gviz)
library(BSgenome.Hsapiens.UCSC.hg38)

# Set working directory and create folders for output files
setwd(".")  # Set this to correct location
dir.create("./output_files/ref_files", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/counts", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/seu_objects", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/empty_drops", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/decontx", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/QC", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/DE", recursive = TRUE, showWarnings = FALSE)
dir.create("./output_files/mutli_sample", recursive = TRUE, showWarnings = FALSE)

2.2 Creating resource files.

To start, we’ll create a few essential files that will be used throughout the analysis. The first step is to generate a CSV file containing three key columns: ENSTID, ENSGID, and geneSymbol. This file will be used as a dictionary to rename entries in both the isoform and gene count matrices, replacing ENSGID with the corresponding gene symbol. By adopting this naming convention for ENSTID, we can easily identify the gene origin of each isoform, streamlining the interpretation and analysis of gene and isoform-level data.

First lets define a helper function for this step:

Code
# Function to make csv naming resource 
make_isoform_gene_symbol_dict <- function(FLAMES_gtf, 
                                          reference_gtf, 
                                          output_file) {
  # Import the first GTF file (transcripts GTF)
  gtf1 <- import(FLAMES_gtf)
  gtf1_df <- as.data.frame(gtf1)
  
  # Select relevant columns from the first GTF
  selected_columns1 <- gtf1_df[, c("transcript_id", "gene_id")]
  unique_selected_cols <- unique(selected_columns1)
  
  # Import the second GTF file (reference GTF with gene symbols)
  gtf2 <- import(reference_gtf)
  gtf2_df <- as.data.frame(gtf2)
  
  # Select relevant columns from the second GTF
  selected_columns2 <- gtf2_df[, c("gene_name", "gene_id")]
  unique_gene_symbol <- unique(selected_columns2)
  
  # Merge the two data frames on 'gene_id'
  combined_data <- merge(unique_selected_cols, 
                         unique_gene_symbol, 
                         by = "gene_id", 
                         all.x = TRUE)
  
  # If 'gene_name' is missing, replace it with 'gene_id'
  combined_data$gene_symbol <- ifelse(is.na(combined_data$gene_name), 
                                      combined_data$gene_id, 
                                      combined_data$gene_name)
  
  # Select relevant columns
  final_combined_data <- combined_data[, c("transcript_id", "gene_id", "gene_symbol")]
  
  # Write to a CSV file
    write.csv(final_combined_data, file = file.path("output_files/ref_files", output_file), row.names = FALSE)

  
  return(final_combined_data)
}

Run this chunk to create the dictionary containing ENSTID, ENSGID, and geneSymbol information:

Code
# The FLAMES ref can be found in your selected output folder after running the Flames pipeline. 
FLAMES_gtf_file <- "./data/FLAMES_out/isoform_annotated.gtf" #ensure file is unzipped
reference_gtf_file <- "data/gencode.v47.annotation.gtf" # ensure file is unzipped
output_file <- "isoform_gene_dict.csv"

# Call the helper function defined in code block above to create a dictionary containing corresponding gene information for each isoform
# This may take a few minutes 
isoform_gene_dict <- make_isoform_gene_symbol_dict(FLAMES_gtf_file,
                                                   reference_gtf_file,
                                                   output_file)

2.3 Convert count matrices from Gene ID to gene Symbol

With the reference dictionary in place, we can now rename both our count matrix and background count matrix by converting ENSGIDs to geneSymbols. This conversion not only simplifies the interpretation of gene expression in single cells but is also necessary for some downstream tools that require gene symbols instead of ENSGIDs, such as automated cell annotation tools.

Like before, lets define a generic helper function first to do this:

Code
convert_ENSGID_to_geneSymbol <- function(gene_count_matrix_path, 
                                         id_symbol_df = isoform_gene_dict, 
                                         output_file,
                                         return_df = FALSE) {
  
  # Load the reference dictionary we made earlier - select gene-level cols
  id_symbol_df <- as_tibble(id_symbol_df) %>%
    dplyr::select(gene_id, gene_symbol)
  
  # Load the data object with ENSGID row names
  gene_count_matrix <- fread(gene_count_matrix_path, header = TRUE)
  colnames(gene_count_matrix)[1] <- "gene_id"
  
  # Replace ENSGIDs with gene symbols in original flames gene-level count matrix
  formatted_gene_count_matrix <- gene_count_matrix %>%
    merge(id_symbol_df, by.x = 'gene_id', by.y = 'gene_id') %>%   # Add gene symbol information
    distinct(gene_symbol, .keep_all = TRUE) %>%   # Remove duplicates based on gene symbol
    dplyr::select(-gene_id) %>%   # Remove the ENSGID column
    column_to_rownames(var = "gene_symbol")   # use the gene symbols we added as rownames
  
  # Write out the processed data frame
  fwrite(formatted_gene_count_matrix, 
            output_file, 
            row.names = TRUE)
  
  # Return the processed count matrix for further use if needed
  if(return_df){
    return(formatted_gene_count_matrix)
  }
}

Run the chunk below to format gene-level count matrices for background and FLAMES data using the helper function from above:

Code
# convert Gene_id to gene symbol for gene counts
convert_ENSGID_to_geneSymbol(
  gene_count_matrix_path = "./data/FLAMES_out/gene_count.csv",
  output_file = "./output_files/counts/geneSymbol_gene_count.csv"
)

# convert Gene_id to gene symbol for background counts
convert_ENSGID_to_geneSymbol(
  gene_count_matrix_path = "./data/background/gene_count.csv",
  output_file = "./output_files/counts/background_geneSymbol_gene_count.csv"
)

Now we have the files we need to begin cleaning our data and removing unwanted noise.