cellassign(github)是对单细胞数据进行自动细胞类型注释的工具。根据marker基因的信息,将单细胞RNA-seq测序测量的细胞分配给已知的细胞类型。
install.packages("tensorflow")
tensorflow::install_tensorflow(extra_packages='tensorflow-probability')
注意这里可能由于网络问题不能安装,可以使用命令pip install --upgrade --no-user --ignore-installed 'tensorflow==2.11.*' 'tensorflow-probability' -i https://pypi.tuna.tsinghua.edu.cn/simple安装
pip install --upgrade --no-user --ignore-installed 'tensorflow==2.11.*' 'tensorflow-probability' -i https://pypi.tuna.tsinghua.edu.cn/simple
devtools::install_github("Irrationone/cellassign")
安装完成后,我们就可以使用将单细胞分配给已知的细胞类型
这里我们构建binary marker gene matrix编码的先验知识的细胞类型首先构建一个list对象,其中名称是细胞类型,条目是每种细胞类型的标记基因(不一定相互排斥) :
binary marker gene matrix
list
marker_gene_list <- list( "Naive CD4+ T" = c("IL7R", "CCR7"), "CD14+ Mono" = c("CD14", "LYZ"), "Memory CD4+" = c("IL7R", "S100A4"), "B" = c("MS4A1"), "CD8+ T" = c("CD8A"), "FCGR3A+ Mono" = c("FCGR3A","MS4A7"), "NK" = c("GNLY","NKG7"), "DC" = c("FCER1A","CST3"), "Platelet" = c("PPBP") )
使用函数marker_list_to_mat将list对象转换为binary marker gene matrix
marker_list_to_mat
marker_gene_mat <- marker_list_to_mat(marker_gene_list) head(marker_gene_mat)
Naive CD4+ T CD14+ Mono Memory CD4+ B CD8+ T FCGR3A+ Mono NK DC Platelet other CCR7 1 0 0 0 0 0 0 0 0 0 CD14 0 1 0 0 0 0 0 0 0 0 CD8A 0 0 0 0 1 0 0 0 0 0 CST3 0 0 0 0 0 0 0 1 0 0 FCER1A 0 0 0 0 0 0 0 1 0 0 FCGR3A 0 0 0 0 0 1 0 0 0 0
从seurat对象中加载count数据测试数据
library(Seurat) pbmc.0 <- readRDS("pmbc_1683691139445.rds") DimPlot(pbmc.0) obj <- pbmc.0 mat <- obj@assays$RNA@counts[rownames(marker_gene_mat),] |> as.matrix() mat <- mat[,colSums(mat)!=0]
fit <- cellassign(t(mat), marker_gene_info = marker_gene_mat, s = colSums(mat), learning_rate = 1e-2, shrinkage = TRUE, verbose = FALSE)
print(fit)
A cellassign fit for 2611 cells, 14 genes, 10 cell types with 0 covariates To access cell types, call celltypes(x) To access cell type probabilities, call cellprobs(x) > dim(mat) [1] 14 2611 > dim(marker_gene_mat) [1] 14 10
print(head(celltypes(fit)))
[1] "unassigned" "B" "unassigned" "unassigned" "unassigned" "unassigned" > length(celltypes(fit)) [1] 2611
pheatmap::pheatmap(cellprobs(fit))
在umap图上展示鉴定的细胞类型
library(ggplot2) obj.filter <- obj[,colnames(mat)] obj.filter@meta.data$cellassign <-celltypes(fit) pd1 <- DimPlot(obj.filter, label = T) pd2 <- DimPlot(obj.filter, label = T,group.by = "cellassign") pd1+pd2
feature <- c("MS4A1","FCGR3A","MS4A7","FCER1A","CST3","PPBP","CD8A","CCR7","IL7R","S100A4","CD14", "LYZ","GNLY","NKG7") p1 <- DotPlot(obj.filter,features =feature)+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) p2 <- DotPlot(obj.filter,features = feature,group.by = "cellassign")+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) p1+p2
cellassign推荐使用scran包的函数computeSumFactors计算size factors。
size factors
library(SingleCellExperiment) library(scran) sceset <- SingleCellExperiment(assays=list(counts=mat)) qclust <- quickCluster(sceset,min.size=30) sceset <- computeSumFactors(sceset,sizes=15,clusters=qclust) s <- sizeFactors(sceset) fit <- cellassign(sceset[rownames(marker_gene_mat),], marker_gene_info = marker_gene_mat, s =s, learning_rate = 1e-2, shrinkage = TRUE, verbose = FALSE)
在许多情况下,细胞类型的标记基因是从先验知识或者数据库(CellMarker)中获取。如果存在纯化的表达数据(bulk or single-cell) ,使用 scran R 软件包中的 findMarkers 函数可以快速获得标记基因。
scran
findMarkers
从Holik et al. Nucleic Acids Research 2017获取大量的 RNA-seq 数据,以获得3个不同细胞系的标记基因。被存储在holik_data
holik_data
data(holik_data)
它包含一个计数矩阵,其中每一行是一个基因(索引由 entrez ID) ,每一列是一个样本:
head(holik_data$counts[,1:2])
RSCE_10_BC2CTUACXX_AGTTCC_L006_R1.bam RSCE_12_BC2CTUACXX_TGACCA_L005_R1.bam 1 13 12 2 0 0 9 346 286 10 0 1 12 10 17 13 1 0
以及每个样本的细胞来源的载体:
head(holik_data$cell_line)
[1] "HCC827" "HCC827" "H2228" "H2228" "H838" "H838"
我们首先提供一个从entrez IDs到gene symbols的映射
entrez IDs
gene symbols
entrez_map <- select(org.Hs.eg.db, as.character(rownames(holik_data$counts)), c("SYMBOL"), "ENTREZID") #> 'select()' returned 1:1 mapping between keys and columns gene_annotations <- entrez_map %>% dplyr::rename(GeneID=ENTREZID, Symbol=SYMBOL)
然后构建 DGEList 对象,输入Limma voom,过滤掉低表达的基因:
DGELis
Limma voom
dge <- DGEList(counts = holik_data$counts, group = holik_data$cell_line, genes = gene_annotations, remove.zeros = TRUE) #> Removing 3620 rows with all zero counts genes_to_keep <- rowSums(cpm(dge$counts) > 0.5) >= 2 dge_filt <- dge[genes_to_keep,]
最后计算归一化因子:
dge_filt <- calcNormFactors(dge_filt, method="TMM")
我们接下来使用 Limma Voom 在3个样品的子集上执行差异表达: HCC827,H2228,H1975:
Limma Voom
dge_subset <- dge_filt[,dge_filt$samples$group %in% c("HCC827", "H2228", "H1975")] design <- model.matrix(~ 0+dge_subset$samples$group) colnames(design) <- levels(dge_subset$samples$group) v <- voom(dge_subset, design) fit <- lmFit(v, design)
接下来,进行拟合对比以发现细胞类型之间差异表达的基因:
contrast.matrix <- makeContrasts(H2228 - H1975, HCC827 - H1975, HCC827 - H2228, levels = design) fit2 <- contrasts.fit(fit, contrast.matrix) fit2 <- eBayes(fit2)
最后,计算基因总结统计并仅过滤显着差异表达的基因(FDR < 0.05) :
tt <- topTable(fit2, n=Inf) tt_sig <- tt %>% dplyr::filter(adj.P.Val < 0.05) head(tt_sig)
#> GeneID Symbol H2228...H1975 HCC827...H1975 HCC827...H2228 AveExpr #> 1 1956 EGFR 0.68777340 4.303436 3.615662 9.208711 #> 2 23480 SEC61G -0.40115418 4.414580 4.815734 7.784072 #> 3 81552 VOPP1 -0.09640406 4.347392 4.443796 6.998559 #> 4 1362 CPD 2.12265278 -2.103571 -4.226224 8.704180 #> 5 729086 LOC729086 -0.04338772 4.384976 4.428363 6.555193 #> 6 55915 LANCL2 -0.57454448 4.021963 4.596508 6.568625 #> F P.Value adj.P.Val #> 1 2026.616 3.447052e-12 2.982899e-08 #> 2 2003.024 3.623981e-12 2.982899e-08 #> 3 1766.548 6.199752e-12 3.232724e-08 #> 4 1671.376 7.854996e-12 3.232724e-08 #> 5 1506.362 1.224645e-11 3.628466e-08 #> 6 1479.497 1.322488e-11 3.628466e-08
为了推导标记基因,我们首先使用 H1975作为基线表达创建一个对数倍数变化矩阵:
lfc_table <- tt_sig[,c("H2228...H1975", "HCC827...H1975")] lfc_table <- lfc_table %>% dplyr::mutate(H1975=0, H2228=H2228...H1975, HCC827=HCC827...H1975) %>% dplyr::select(H1975, H2228, HCC827) rownames(lfc_table) <- tt_sig$GeneID
然后,对于每个基因,我们减去最小对数倍数变化,因为我们关心基因相对于某个最小表达水平的过度表达,因为这定义了一个标记基因:
lfc_table <- as.matrix(lfc_table) lfc_table <- lfc_table - rowMins(lfc_table) lfc_table <- as.data.frame(lfc_table)
现在,我们定义了一个辅助函数,用于将log fold change更改转换为二进制矩阵。这需要一个矩阵和一个阈值,任何小于或等于阈值的值都被设置为0,其他值都被设置为1:
log fold change
binarize <- function(x, threshold) { x[x <= threshold] <- -Inf x[x > -Inf] <- 1 x[x == -Inf] <- 0 return(x) }
接下来,我们实现了一个对这个矩阵进行二值化的基本过程。基本上,我们寻找每个基因最大的表达gap,表达在这个gap之上的细胞类型被指定具有该基因作为marker
gap
marker
# Find the biggest difference maxdiffs <- apply(lfc_table, 1, function(x) max(diff(sort(x)))) # thres_vals <- apply(lfc_table, 1, function(x) sort(x)[which.max(diff(sort(x)))]) expr_mat_thres <- plyr::rbind.fill(lapply(1:nrow(lfc_table), function(i) { binarize(lfc_table[i,], thres_vals[i]) })) rownames(expr_mat_thres) <- rownames(lfc_table) marker_gene_mat <- expr_mat_thres[(maxdiffs >= quantile(maxdiffs, c(.99))) & (thres_vals <= log(2)),] %>% as.matrix
最后,我们添加gene symbols,而不是entrez ids:
entrez ids
suppressMessages({ symbols <- plyr::mapvalues( rownames(marker_gene_mat), from = gene_annotations$GeneID, to = gene_annotations$Symbol ) }) is_na <- is.na(symbols) marker_gene_mat <- marker_gene_mat[!is_na,] rownames(marker_gene_mat) <- symbols[!is_na]
这里我们有一个细胞类型的标记基因矩阵:
head(marker_gene_mat)
#> H1975 H2228 HCC827 #> NRK 0 0 1 #> MTAP 1 0 1 #> CP 0 1 0 #> HIST1H3F 1 0 1 #> HIST1H4E 1 0 1 #> ADIRF 1 1 0
pheatmap(t(marker_gene_mat))
确定使用的新 Python 解释器的路径。例如,假设您希望使用位于 /usr/local/bin/python3.7 的 Python 3.7 解释器。
library(tensorflow) reticulate::use_python("/usr/local/bin/python3.7")