I hope you are doing well.
I am currently working with your package to analyse the class-switch of a sample from which we have single-cell RNAseq and VDJ sequencing data. In order to do so, I am following your tutorial in "https://josephng-bio.org/sciCSR/articles/csr.html", which is very clarifying. However, I am struggling when running the function convertSeuratToH5ad(). This is something that has previously been brought up when using the Seurat package (mojaveazure/seurat-disk#30). Although we tried using different combinations of R and package versions as suggested (mojaveazure/seurat-disk#30 (comment)), we have not been able to solve this issue.
Would you happen to know how to solve this, or else the reason why is this occurring? If so, would you be so kind to share this information? We would be really interested in analyzing the class switch of this specific sample but would also like to expand the analysis to other samples as well.
Thank you so much in advance. Please let me know if you were to require any other information from my side.
library(airr)
library(Seurat)
#######################################
# INTEGRATE scBCR-SEQ REPERTOIRE DATA #
#######################################
HC2 = Seurat::Read10X(data.dir = "/bicoh/MARGenomics/20230227_Giuliana_scRNASeq_BCR/Analysis/Sequencing_data/HC_2/HC_2/outs/per_sample_outs/HC_2/count/sample_filtered_feature_bc_matrix")
HC2 = CreateSeuratObject(HC2)
[email protected]$sample_id = "HC2"
HC2 = NormalizeData(HC2, normalization.method = "LogNormalize", scale.factor = 10000)
HC2 = FindVariableFeatures(HC2, selection.method="vst", nfeatures=2000)
HC2 = ScaleData(HC2)
HC2 = RunPCA(HC2, features=VariableFeatures(HC2))
pct = HC2[["pca"]]@stdev / sum(HC2[["pca"]]@stdev) * 100
cumu = cumsum(pct)
co1 = which(cumu > 90 & pct < 5)[1] # 41
co2 = sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1 # 17
co3 = min(co1, co2) # co3 = 17
HC2 = RunUMAP(HC2, dims=1:co3)
HC2 = FindNeighbors(HC2, dims=1:co3)
HC2 = FindClusters(HC2, resolution = 0.8)
# This is the filtered_contig_annotations.csv file from cellranger vdj
vdj <- read.csv(file.path(wd, "Sequencing_data/HC_2/HC_2/outs/per_sample_outs/HC_2/vdj_b/filtered_contig_annotations.csv"), stringsAsFactors = FALSE)
# Read AIRR-format output from IMGT/HighV-Quest
vquest = read_rearrangement(file.path(wd, "Results/BCR_analysis/filtered_contig_igblast_HC_2_db-pass.tsv"))
# The % sequence identity to germline V allele is the column 'v_identity'
# We want to merge this column into cellranger vdj output
# using the sequence IDs as keys
vdj <- merge(vdj, vquest[, c("sequence_id", "v_identity")],
by.x = "contig_id", by.y = "sequence_id",
all.x = TRUE, all.y = FALSE, sort = FALSE)
# collapseBCR reduces the input data such that for each cell, at most 1 H and 1 L sequences are retained for merging into the Seurat object. The full table without removing those sequences can also be accessed, by indicating full.table = TRUE when calling collapseBCR.
collapsed <- collapseBCR(vdj, format = "10X")
# First make sure we have a column in the vdj table which corresponds to
# sample names - this column must be called 'sample_name'
# In this case it is the column 'donor' - first rename this to 'sample_name'
collapsed$sample_name = "HC2"
HC2 <- combineBCR(
collapsed, HC2,
# list the columns from vdj you wish to add to the Seurat object down here
keep_columns = c("v_gene", "d_gene", "j_gene", "c_gene",
"v_identity", "full_length",
"productive", "cdr3", "cdr3_nt",
"reads", "umis")
)
DimPlot(HC2, group.by = "IGH_c_gene")
# there is a column 'bcr_type' added to the Seurat object which indicates
# whether the cell is singlet/doublet etc. cells without BCR transcripts will
# be labelled NA
HC2 <- HC2[, Cells(HC2)[which(!is.na(HC2$bcr_type))]] # 10541 cells -> 2354
# subset to retain those cells positive for both CD19 and CD20 transcripts
# HC2 <- subset(HC2, subset = (MS4A1 > 0 & CD19 > 0)) # 2354 -> 101 WE DO NOT USE THIS AS WE KEEP SO FEW CELLS
# Since we have subset the data, rerun FindNeighbors
HC2 <- FindNeighbors(HC2)
###############################################################
# ENUMERATE PRODUCTIVE AND STERILE IG HEAVY CHAIN TRANSCRIPTS #
###############################################################
# Run in SHIVA
# sample = file.path(wd, "Sequencing_data/FASTQ/HC_2/possorted_genome_bam_sorted.bam")
# example = system.file("extdata/Hong_S1_sampled_Igh.bam", package = "sciCSR")
#
# data("human_definitions") # load the genomic coordinates of the V, D and J genes
# data("mouse_definitions")
#
# # Replace "14" by "chr14" in order for the bam and the definitions file to match. Otherwise it will fail?
#
# library(GenomeInfoDb)
# for(i in seq_along(human_definitions)) {
# seqlevels(human_definitions[[i]]) <- sub("^14$", "chr14", seqlevels(human_definitions[[i]]))
# }
#
# out <- getIGHmapping(sample, human_definitions, cellBarcodeTag = "CB", umiTag = "UB",
# paired = FALSE, flank = 5000) #lo q falla és el paired (funció scanBam)
# out2 <- getIGHreadType(out$read_count)
# out3 <- summariseIGHreads(out2, human_definitions)
out3 <- readRDS(file.path(results_dir, "getIGHmapping_out3.rds"))
# HC2_repaired = repairBarcode(out3, HC2, seurat_sample_column = "sample_id") # only needed when >1 sample
# HC2_repaired <- do.call("rbind", HC2_repaired) # only needed when >1 sample
# remove cells which are not in the Seurat object but happen to have
# observed productive/sterile transcripts
out3_clean <- out3[which(rownames(out3) %in% Cells(HC2)), ] # from 40286 to 2354/101
# Add the info of productive/sterile count matrix to the Seurat object. We add the variables "nCount_IGHC" and "nFeature_IGHC"
HC2 <- mergeIgHCountsToSeurat(
igh_counts = out3_clean,
SeuratObj = HC2,
# We will store the productive/sterile count matrix as a new
# assay called 'IGHC'
assay_name = "IGHC"
)
# Check expression in some cells
slot(HC2[["IGHC"]], "counts")[1:9, 1:3]
# We can see for each Igh C gene, there are three separate counts:
# ‘-S’: these correspond to the sterile transcripts;
# ‘-P’: these correspond to the productive transcripts;
# ‘-C’: these corresponds to transcripts which cannot be classified as sterile or productive (e.g. only 1 read mapping to the coding exons are found, in this case it can be either sterile or productive, we do not have the confidence to assign this to either category).
# Normalize data
HC2 <- NormalizeData(HC2, assay = "IGHC" )
genes <- rownames(HC2[["IGHC"]])
# Let's say we want to plot only the sterile and productive counts
genes <- genes[which(grepl("-[SP]$", genes))]
DotPlot(HC2, assay = "IGHC", features = genes) +
RotatedAxis()
###################################################
# INFER TRANSITIONS USING CSR AND SHM INFORMATION #
###################################################
# calculate CSR potential
HC2 <- getCSRpotential(
SeuratObj = HC2,
# Here we use the isotype information from the merged
# repertoire annotations. Column 'IGH_c_gene' stores
# this information
c_gene_anno_name = "IGH_c_gene",
# Specify this is a mouse dataset. Change to 'human'
# if you work on human data
reference_based = "human"
)
# get SHM frequency (= 1 - IGH_v_identity)
# we have merged the germline identity information from the
# repertoire to the Seurat object (column 'IGH_v_identity')
HC2 <- getSHM(HC2, v_identity_anno_name = "IGH_v_identity")
saveRDS(HC2, file = file.path(results_dir, "HC2_seurat.rds"))
convertSeuratToH5ad(HC2, assays = c("RNA"), "HC2_Bcells.h5ad")
> sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS
Matrix products: default
BLAS/LAPACK: /soft/system/software/OpenBLAS/0.3.12-GCC-10.2.0/lib/libopenblas_prescottp-r0.3.12.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=es_ES.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=es_ES.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=es_ES.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] airr_1.5.0 kableExtra_1.3.4 knitr_1.45 openxlsx2_1.2 sciCSR_0.3.1 reticulate_1.34.0
[7] S4Vectors_0.36.2 BiocGenerics_0.44.0 Matrix_1.6-4 Seurat_5.0.1 SeuratObject_5.0.1 sp_2.1-2
[13] edgeR_3.40.2 limma_3.54.2
loaded via a namespace (and not attached):
[1] utf8_1.2.4 spatstat.explore_3.2-5 R.utils_2.12.3 tidyselect_1.2.0
[5] htmlwidgets_1.6.4 grid_4.2.1 BiocParallel_1.32.6 Rtsne_0.17
[9] munsell_0.5.0 codetools_0.2-18 ica_1.0-3 future_1.33.1
[13] miniUI_0.1.1.1 withr_2.5.2 spatstat.random_3.2-2 colorspace_2.1-0
[17] progressr_0.14.0 Biobase_2.58.0 rstudioapi_0.15.0 ROCR_1.0-11
[21] tensor_1.5 listenv_0.9.0 labeling_0.4.3 MatrixGenerics_1.10.0
[25] GenomeInfoDbData_1.2.9 harmony_1.2.0 polyclip_1.10-6 farver_2.1.1
[29] bit64_4.0.5 parallelly_1.36.0 vctrs_0.6.5 generics_0.1.3
[33] xfun_0.41 R6_2.5.1 GenomeInfoDb_1.34.9 locfit_1.5-9.8
[37] hdf5r_1.3.5 bitops_1.0-7 spatstat.utils_3.0-4 DelayedArray_0.24.0
[41] markovchain_0.9.5 vroom_1.6.5 promises_1.2.1 scales_1.3.0
[45] gtable_0.3.4 globals_0.16.2 goftest_1.2-3 spam_2.10-0
[49] rlang_1.1.2 systemfonts_1.0.4 splines_4.2.1 lazyeval_0.2.2
[53] spatstat.geom_3.2-7 yaml_2.3.8 reshape2_1.4.4 abind_1.4-5
[57] httpuv_1.6.13 SeuratDisk_0.0.0.9021 tools_4.2.1 ggplot2_3.4.4
[61] ellipsis_0.3.2 RColorBrewer_1.1-3 ggridges_0.5.5 Rcpp_1.0.12
[65] plyr_1.8.9 zlibbioc_1.44.0 purrr_1.0.2 RCurl_1.98-1.14
[69] deldir_2.0-2 pbapply_1.7-2 cowplot_1.1.2 zoo_1.8-12
[73] SummarizedExperiment_1.28.0 ggrepel_0.9.4 cluster_2.1.4 magrittr_2.0.3
[77] data.table_1.14.10 RSpectra_0.16-1 scattermore_1.2 lmtest_0.9-40
[81] RANN_2.6.1 fitdistrplus_1.1-11 matrixStats_1.2.0 hms_1.1.3
[85] patchwork_1.2.0 mime_0.12 evaluate_0.23 xtable_1.8-4
[89] fastDummies_1.7.3 IRanges_2.32.0 gridExtra_2.3 compiler_4.2.1
[93] tibble_3.2.1 KernSmooth_2.23-20 crayon_1.5.2 R.oo_1.25.0
[97] htmltools_0.5.7 later_1.3.2 tzdb_0.4.0 tidyr_1.3.0
[101] expm_0.999-8 RcppParallel_5.1.7 MASS_7.3-58.1 rappdirs_0.3.3
[105] readr_2.1.4 cli_3.6.2 R.methodsS3_1.8.2 parallel_4.2.1
[109] dotCall64_1.1-1 igraph_1.6.0 GenomicRanges_1.50.2 pkgconfig_2.0.3
[113] GenomicAlignments_1.32.1 philentropy_0.8.0 plotly_4.10.3 spatstat.sparse_3.0-3
[117] xml2_1.3.6 svglite_2.1.3 webshot_0.5.3 XVector_0.38.0
[121] rvest_1.0.3 stringr_1.5.1 digest_0.6.33 sctransform_0.4.1
[125] RcppAnnoy_0.0.21 spatstat.data_3.0-3 Biostrings_2.66.0 rmarkdown_2.25
[129] leiden_0.4.3.1 uwot_0.1.16 shiny_1.8.0 Rsamtools_2.14.0
[133] lifecycle_1.0.4 nlme_3.1-159 jsonlite_1.8.8 viridisLite_0.4.2
[137] fansi_1.0.6 pillar_1.9.0 lattice_0.20-45 fastmap_1.1.1
[141] httr_1.4.7 survival_3.4-0 glue_1.6.2 zip_2.3.0
[145] png_0.1-8 bit_4.0.5 stringi_1.8.3 nnls_1.5
[149] RcppHNSW_0.5.0 dplyr_1.1.4 irlba_2.3.5.1 future.apply_1.11.1