# Load the raw filtered_gene_bc_matrix outputed by Cell Ranger v2.1.1
Countdata <- Read10X(data.dir = "./1-QualityControl/Rawdata")
# Initialize the Seurat object
Raw.data <- CreateSeuratObject(raw.data = Countdata,
min.cells = 3,
min.genes = 800,
project = "E12.5-PSB-WT")
Raw.data@meta.data$Barcodes <- rownames(Raw.data@meta.data)
# Percent of mitochondrial counts
mito.genes <- grep(pattern = "^mt-", x = rownames(x = Raw.data@data), value = TRUE)
percent.mito <- Matrix::colSums(Raw.data@raw.data[mito.genes, ])/Matrix::colSums(Raw.data@raw.data)
Raw.data <- AddMetaData(object = Raw.data, metadata = percent.mito, col.name = "percent.mito")
# Percent of mitochondrial ribosomal
ribo.genes <- grep(pattern = "(^Rpl|^Rps|^Mrp)", x = rownames(x = Raw.data@data), value = TRUE)
percent.ribo <- Matrix::colSums(Raw.data@raw.data[ribo.genes, ])/Matrix::colSums(Raw.data@raw.data)
Raw.data <- AddMetaData(object = Raw.data, metadata = percent.ribo, col.name = "percent.ribo")
#Violin plot
VlnPlot(object = Raw.data, features.plot = c("nGene","nUMI", "percent.mito", "percent.ribo"), nCol = 2 )
# Relation between nUMI and nGene detected
p1 <- ggplot(Raw.data@meta.data, aes(x=nUMI, y=nGene)) + geom_point() + geom_smooth(method="lm")
p1 <- ggMarginal(p1, type = "histogram", fill="lightgrey")
p2 <- ggplot(Raw.data@meta.data, aes(x=log10(nUMI), y=log10(nGene))) + geom_point() + geom_smooth(method="lm")
p2 <- ggMarginal(p2, type = "histogram", fill="lightgrey")
plot_grid(plotlist = list(p1,p2), ncol=2, align='h', rel_widths = c(1, 1))
We applied a high and low median absolute deviation (mad) thresholds to exclude outlier cells
max.mito.thr <- median(Cell.QC.Stat$percent.mito) + 3*mad(Cell.QC.Stat$percent.mito)
min.mito.thr <- median(Cell.QC.Stat$percent.mito) - 3*mad(Cell.QC.Stat$percent.mito)
p1 <- ggplot(Cell.QC.Stat, aes(x=nGene, y=percent.mito)) +
geom_point() +
geom_hline(aes(yintercept = max.mito.thr), colour = "red", linetype = 2) +
geom_hline(aes(yintercept = min.mito.thr), colour = "red", linetype = 2) +
annotate(geom = "text", label = paste0(as.numeric(table(Cell.QC.Stat$percent.mito > max.mito.thr | Cell.QC.Stat$percent.mito < min.mito.thr)[2])," cells removed\n",
as.numeric(table(Cell.QC.Stat$percent.mito > max.mito.thr | Cell.QC.Stat$percent.mito < min.mito.thr)[1])," cells remain"), x = 6000, y = 0.1)
ggMarginal(p1, type = "histogram", fill="lightgrey", bins=100)
We filter cells which are likely to be doublet based on their higher content of transcript detected as well as cell with to few genes/UMI sequenced
# Set low and hight thresholds on the number of detected genes
min.Genes.thr <- median(log10(Cell.QC.Stat$nGene)) - 3*mad(log10(Cell.QC.Stat$nGene))
max.Genes.thr <- median(log10(Cell.QC.Stat$nGene)) + 3*mad(log10(Cell.QC.Stat$nGene))
# Set hight threshold on the number of transcripts
max.nUMI.thr <- median(log10(Cell.QC.Stat$nUMI)) + 3*mad(log10(Cell.QC.Stat$nUMI))
# Gene/UMI scatter plot before filtering
p1 <- ggplot(Cell.QC.Stat, aes(x=log10(nUMI), y=log10(nGene))) +
geom_point() +
geom_smooth(method="lm") +
geom_hline(aes(yintercept = min.Genes.thr), colour = "green", linetype = 2) +
geom_hline(aes(yintercept = max.Genes.thr), colour = "green", linetype = 2) +
geom_vline(aes(xintercept = max.nUMI.thr), colour = "red", linetype = 2)
ggMarginal(p1, type = "histogram", fill="lightgrey")
# Filter cells base on both metrics
Cell.QC.Stat <- Cell.QC.Stat %>% filter(log10(nGene) > min.Genes.thr) %>% filter(log10(nUMI) < max.nUMI.thr)
lm.model <- lm(data = Cell.QC.Stat, formula = log10(nGene) ~ log10(nUMI))
p2 <- ggplot(Cell.QC.Stat, aes(x=log10(nUMI), y=log10(nGene))) +
geom_point() +
geom_smooth(method="lm") +
geom_hline(aes(yintercept = min.Genes.thr), colour = "green", linetype = 2) +
geom_hline(aes(yintercept = max.Genes.thr), colour = "green", linetype = 2) +
geom_vline(aes(xintercept = max.nUMI.thr), colour = "red", linetype = 2) +
geom_abline(intercept = lm.model$coefficients[1] - 0.09 , slope = lm.model$coefficients[2], color="orange") +
annotate(geom = "text", label = paste0(dim(Cell.QC.Stat)[1], " QC passed cells"), x = 4, y = 3.8)
ggMarginal(p2, type = "histogram", fill="lightgrey")
# Cells to exclude lie below an intersept offset of -0.09
Cell.QC.Stat$valideCells <- log10(Cell.QC.Stat$nGene) > (log10(Cell.QC.Stat$nUMI) * lm.model$coefficients[2] + (lm.model$coefficients[1] - 0.09))
p3 <- ggplot(Cell.QC.Stat, aes(x=log10(nUMI), y=log10(nGene))) +
geom_point(aes(colour = valideCells)) +
geom_smooth(method="lm") +
geom_abline(intercept = lm.model$coefficients[1] - 0.09 , slope = lm.model$coefficients[2], color="orange") +
theme(legend.position="none") +
annotate(geom = "text", label = paste0(as.numeric(table(Cell.QC.Stat$valideCells)[2]), " QC passed cells\n",
as.numeric(table(Cell.QC.Stat$valideCells)[1]), " QC filtered"), x = 4, y = 3.8)
ggMarginal(p3, type = "histogram", fill="lightgrey")
# Plot final QC metrics
VlnPlot(object = Raw.data, features.plot = c("nGene","nUMI", "percent.mito", "percent.ribo"), nCol = 2 )
Export raw count matrix as input to Scrublet
#Export filtered matrix
dir.create("./1-QualityControl/QCdata")
exprData <- Matrix(as.matrix(Raw.data@raw.data), sparse = TRUE)
writeMM(exprData, "./1-QualityControl/QCdata/matrix.mtx")
## NULL
import scrublet as scr
import scipy.io
import numpy as np
import os
#Load raw counts matrix and gene list
input_dir = '/home/matthieu/Bureau/R/FiguresCodes/1-QualityControl'
counts_matrix = scipy.io.mmread(input_dir + '/QCdata/matrix.mtx').T.tocsc()
#Initialize Scrublet object
scrub = scr.Scrublet(counts_matrix,
expected_doublet_rate=0.1,
sim_doublet_ratio=2,
n_neighbors = 8)
#Run the default pipeline
doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=1,
min_cells=3,
min_gene_variability_pctl=85,
n_prin_comps=25)
## Preprocessing...
## Simulating doublets...
## Embedding transcriptomes using PCA...
## Calculating doublet scores...
## Automatically set threshold at doublet score = 0.23
## Detected doublet rate = 3.9%
## Estimated detectable doublet fraction = 70.6%
## Overall doublet rate:
## Expected = 10.0%
## Estimated = 5.5%
## Elapsed time: 5.0 seconds
# Import scrublet's doublet score
Raw.data@meta.data$Doubletscore <- py$doublet_scores
# Plot doublet score
ggplot(Raw.data@meta.data, aes(x = Doubletscore, stat(ndensity))) +
geom_histogram(bins = 200, colour ="lightgrey")+
geom_vline(xintercept = 0.23, colour = "red", linetype = 2)+
geom_vline(xintercept = 0.15, colour = "green", linetype = 2) # Manually set threshold
# Manually set threshold at doublet score to 0.15
Raw.data@meta.data$Predicted_doublets <- ifelse(py$doublet_scores > 0.15, "Doublet","Singlet" )
table(Raw.data@meta.data$Predicted_doublets)
##
## Doublet Singlet
## 282 4234
# Filter genes expressed by less than 3 cells
num.cells <- Matrix::rowSums(Raw.data@data > 0)
genes.use <- names(x = num.cells[which(x = num.cells >= 3)])
Raw.data@raw.data <- Raw.data@raw.data[genes.use, ]
Raw.data@data <- Raw.data@data[genes.use, ]
# logNormalized the gene expression matrix
Raw.data <- NormalizeData(object = Raw.data,
normalization.method = "LogNormalize",
scale.factor = round(median(Raw.data@meta.data$nUMI)),
display.progress = F)
# Assign Cell-Cycle Scores
s.genes <- c("Mcm5", "Pcna", "Tym5", "Fen1", "Mcm2", "Mcm4", "Rrm1", "Ung", "Gins2", "Mcm6", "Cdca7", "Dtl", "Prim1", "Uhrf1", "Mlf1ip", "Hells", "Rfc2", "Rap2", "Nasp", "Rad51ap1", "Gmnn", "Wdr76", "Slbp", "Ccne2", "Ubr7", "Pold3", "Msh2", "Atad2", "Rad51", "Rrm2", "Cdc45", "Cdc6", "Exo1", "Tipin", "Dscc1", "Blm", " Casp8ap2", "Usp1", "Clspn", "Pola1", "Chaf1b", "Brip1", "E2f8")
g2m.genes <- c("Hmgb2", "Ddk1","Nusap1", "Ube2c", "Birc5", "Tpx2", "Top2a", "Ndc80", "Cks2", "Nuf2", "Cks1b", "Mki67", "Tmpo", " Cenpk", "Tacc3", "Fam64a", "Smc4", "Ccnb2", "Ckap2l", "Ckap2", "Aurkb", "Bub1", "Kif11", "Anp32e", "Tubb4b", "Gtse1", "kif20b", "Hjurp", "Cdca3", "Hn1", "Cdc20", "Ttk", "Cdc25c", "kif2c", "Rangap1", "Ncapd2", "Dlgap5", "Cdca2", "Cdca8", "Ect2", "Kif23", "Hmmr", "Aurka", "Psrc1", "Anln", "Lbr", "Ckap5", "Cenpe", "Ctcf", "Nek2", "G2e3", "Gas2l3", "Cbx5", "Cenpa")
Raw.data <- CellCycleScoring(object = Raw.data,
s.genes = s.genes,
g2m.genes = g2m.genes,
set.ident = TRUE)
# Regresse-out uninterresting source of variation and scale the expression matrix
Raw.data@meta.data$CC.Difference <- Raw.data@meta.data$S.Score - Raw.data@meta.data$G2M.Score
Raw.data <- ScaleData(object = Raw.data,
vars.to.regress = c("CC.Difference","percent.mito","nGene", "nUMI"),
display.progress = F)
#Remove Scrublet infered doublets
Valid.Cells <- rownames(Raw.data@meta.data[Raw.data@meta.data$Predicted_doublets == "Singlet",])
#Manual removal of 9 cells during the initial spring visualisation
manual.removed <- read.table("./1-QualityControl/SpringCoordinates/Spring.outlyer.csv", header = T, sep = ";")
Cells.to.keep <- Valid.Cells[!Valid.Cells %in% manual.removed$Barcode]
Raw.data@meta.data$Cells.to.keep <- rownames(Raw.data@meta.data) %in% Cells.to.keep
QCFiltered.data <- SubsetData(Raw.data, cells.use = Cells.to.keep, subset.raw = T, do.clean = F)
# Export raw expression matrix and gene list to regenerate a spring plot
exprData <- Matrix(as.matrix(QCFiltered.data@raw.data), sparse = TRUE)
writeMM(exprData, "./1-QualityControl/SpringCoordinates/ExprData.mtx")
## NULL
Genelist <- row.names(QCFiltered.data@raw.data)
write.table(Genelist, "./1-QualityControl/SpringCoordinates/Genelist.csv", sep="\t", col.names = F, row.names = F)
Spring coordinates were generated using the online version of SPRING with the following parameters :
Number of cells: 4225
Number of genes that passed filter: 1400
Min expressing cells (gene filtering): 3
Min number of UMIs (gene filtering): 3
Gene variability %ile (gene filtering): 90
Number of principal components: 25
Number of nearest neighbors: 8
Number of force layout iterations: 500
# Import Spring coordinates
Coordinates <- read.table("./1-QualityControl/SpringCoordinates/Allcells_SpringCoordinates.txt", sep=",", header = F)[,c(2,3)]
rownames(Coordinates) <- colnames(QCFiltered.data@data)
QCFiltered.data <- SetDimReduction(QCFiltered.data,
reduction.type = "spring",
slot = "cell.embeddings",
new.data = as.matrix(Coordinates))
QCFiltered.data@dr$spring@key <- "spring"
colnames(QCFiltered.data@dr$spring@cell.embeddings) <- paste0(GetDimReduction(object=QCFiltered.data, reduction.type = "spring",slot = "key"), c(1,2))
# Symmetry transform of the coordinates
Spring.Sym <- function(x){
x = abs(max(QCFiltered.data@dr$spring@cell.embeddings[,2])-x)
}
QCFiltered.data@dr$spring@cell.embeddings[,2] <- sapply(QCFiltered.data@dr$spring@cell.embeddings[,2] , function(x) Spring.Sym(x))
QCFiltered.data@dr$spring@cell.embeddings[,1] <- sapply(QCFiltered.data@dr$spring@cell.embeddings[,1] , function(x) Spring.Sym(x))
QCFiltered.data <- SetAllIdent(QCFiltered.data, id = "old.ident")
p <- DimPlot(QCFiltered.data,
reduction.use = "spring",
cols.use = "#969696",
dim.1 = 1,
dim.2 = 2,
do.label= F,
no.legend = T,
do.return = T)
ncells <- dim(QCFiltered.data@data)[2]
ngenes <- dim(QCFiltered.data@data)[1]
p + annotate(geom = "text", label = paste0(ncells," cells\n", ngenes," genes"), x = 500, y = 400)
## [1] "30 novembre, 2020, 10,26"
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
##
## locale:
## [1] LC_CTYPE=fr_FR.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=fr_FR.UTF-8 LC_COLLATE=fr_FR.UTF-8
## [5] LC_MONETARY=fr_FR.UTF-8 LC_MESSAGES=fr_FR.UTF-8
## [7] LC_PAPER=fr_FR.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reticulate_1.13 ggExtra_0.9 RColorBrewer_1.1-2 dplyr_0.8.3
## [5] Seurat_2.3.4 Matrix_1.2-17 cowplot_1.0.0 ggplot2_3.2.1
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-1 class_7.3-17
## [4] modeltools_0.2-22 ggridges_0.5.1 mclust_5.4.5
## [7] htmlTable_1.13.2 base64enc_0.1-3 rstudioapi_0.11
## [10] proxy_0.4-23 farver_2.0.1 npsurv_0.4-0
## [13] flexmix_2.3-15 bit64_4.0.2 codetools_0.2-16
## [16] splines_3.6.3 R.methodsS3_1.7.1 lsei_1.2-0
## [19] robustbase_0.93-5 knitr_1.26 zeallot_0.1.0
## [22] jsonlite_1.7.0 Formula_1.2-3 ica_1.0-2
## [25] cluster_2.1.0 kernlab_0.9-29 png_0.1-7
## [28] R.oo_1.23.0 shiny_1.4.0 compiler_3.6.3
## [31] httr_1.4.1 backports_1.1.5 fastmap_1.0.1
## [34] assertthat_0.2.1 lazyeval_0.2.2 later_1.0.0
## [37] lars_1.2 acepack_1.4.1 htmltools_0.5.0
## [40] tools_3.6.3 igraph_1.2.5 gtable_0.3.0
## [43] glue_1.4.1 RANN_2.6.1 reshape2_1.4.3
## [46] Rcpp_1.0.5 vctrs_0.2.0 gdata_2.18.0
## [49] ape_5.3 nlme_3.1-141 iterators_1.0.12
## [52] fpc_2.2-3 gbRd_0.4-11 lmtest_0.9-37
## [55] xfun_0.18 stringr_1.4.0 mime_0.7
## [58] miniUI_0.1.1.1 lifecycle_0.1.0 irlba_2.3.3
## [61] gtools_3.8.1 DEoptimR_1.0-8 MASS_7.3-53
## [64] zoo_1.8-6 scales_1.1.0 promises_1.1.0
## [67] doSNOW_1.0.18 parallel_3.6.3 yaml_2.2.1
## [70] pbapply_1.4-2 gridExtra_2.3 rpart_4.1-15
## [73] segmented_1.0-0 latticeExtra_0.6-28 stringi_1.4.6
## [76] foreach_1.4.7 checkmate_1.9.4 caTools_1.17.1.2
## [79] bibtex_0.4.2 Rdpack_0.11-0 SDMTools_1.1-221.1
## [82] rlang_0.4.7 pkgconfig_2.0.3 dtw_1.21-3
## [85] prabclus_2.3-1 bitops_1.0-6 evaluate_0.14
## [88] lattice_0.20-41 ROCR_1.0-7 purrr_0.3.3
## [91] labeling_0.3 htmlwidgets_1.5.1 bit_4.0.4
## [94] tidyselect_0.2.5 plyr_1.8.4 magrittr_1.5
## [97] R6_2.4.1 snow_0.4-3 gplots_3.0.1.1
## [100] Hmisc_4.3-0 pillar_1.4.2 foreign_0.8-72
## [103] withr_2.1.2 fitdistrplus_1.0-14 mixtools_1.1.0
## [106] survival_2.44-1.1 nnet_7.3-14 tsne_0.1-3
## [109] tibble_2.1.3 crayon_1.3.4 hdf5r_1.3.2.9000
## [112] KernSmooth_2.23-15 rmarkdown_2.5 grid_3.6.3
## [115] data.table_1.12.6 metap_1.1 digest_0.6.25
## [118] diptest_0.75-7 xtable_1.8-4 httpuv_1.5.2
## [121] tidyr_1.0.0 R.utils_2.9.0 stats4_3.6.3
## [124] munsell_0.5.0
Institute of Psychiatry and Neuroscience of Paris, INSERM U1266, 75014, Paris, France, matthieu.moreau@inserm.fr↩