cellassign
cellassign(github)是对单细胞数据进行自动细胞类型注释的工具。根据marker基因的信息,将单细胞RNA-seq测序测量的细胞分配给已知的细胞类型。
- cellassign依赖tensorflow(v2.0.0),因此首先需要安装tensorflow
install.packages("tensorflow")
tensorflow::install_tensorflow(extra_packages='tensorflow-probability')
注意这里可能由于网络问题不能安装,可以使用命令
pip install --upgrade --no-user --ignore-installed 'tensorflow==2.11.*' 'tensorflow-probability' -i https://pypi.tuna.tsinghua.edu.cn/simple
安装
- 接着使用devtools从github安装cellassign
devtools::install_github("Irrationone/cellassign")
安装完成后,我们就可以使用将单细胞分配给已知的细胞类型
利用cellassign注释细胞类型
根据先验知识构建marker基因矩阵
这里我们构建binary marker gene matrix
编码的先验知识的细胞类型
首先构建一个list
对象,其中名称是细胞类型,条目是每种细胞类型的标记基因(不一定相互排斥) :
marker_gene_list <- list(
"Naive CD4+ T" = c("IL7R", "CCR7"),
"CD14+ Mono" = c("CD14", "LYZ"),
"Memory CD4+" = c("IL7R", "S100A4"),
"B" = c("MS4A1"),
"CD8+ T" = c("CD8A"),
"FCGR3A+ Mono" = c("FCGR3A","MS4A7"),
"NK" = c("GNLY","NKG7"),
"DC" = c("FCER1A","CST3"),
"Platelet" = c("PPBP")
)
使用函数marker_list_to_mat
将list对象转换为binary marker gene matrix
marker_gene_mat <- marker_list_to_mat(marker_gene_list)
head(marker_gene_mat)
Naive CD4+ T CD14+ Mono Memory CD4+ B CD8+ T FCGR3A+ Mono NK DC Platelet other
CCR7 1 0 0 0 0 0 0 0 0 0
CD14 0 1 0 0 0 0 0 0 0 0
CD8A 0 0 0 0 1 0 0 0 0 0
CST3 0 0 0 0 0 0 0 1 0 0
FCER1A 0 0 0 0 0 0 0 1 0 0
FCGR3A 0 0 0 0 0 1 0 0 0 0
从seurat对象中加载count数据
测试数据
library(Seurat)
pbmc.0 <- readRDS("pmbc_1683691139445.rds")
DimPlot(pbmc.0)
obj <- pbmc.0
mat <- obj@assays$RNA@counts[rownames(marker_gene_mat),] |> as.matrix()
mat <- mat[,colSums(mat)!=0]
fit <- cellassign(t(mat),
marker_gene_info = marker_gene_mat,
s = colSums(mat),
learning_rate = 1e-2,
shrinkage = TRUE,
verbose = FALSE)
print(fit)
A cellassign fit for 2611 cells, 14 genes, 10 cell types with 0 covariates
To access cell types, call celltypes(x)
To access cell type probabilities, call cellprobs(x)
> dim(mat)
[1] 14 2611
> dim(marker_gene_mat)
[1] 14 10
print(head(celltypes(fit)))
[1] "unassigned" "B" "unassigned" "unassigned" "unassigned" "unassigned"
> length(celltypes(fit))
[1] 2611
pheatmap::pheatmap(cellprobs(fit))
在umap图上展示鉴定的细胞类型
library(ggplot2)
obj.filter <- obj[,colnames(mat)]
obj.filter@meta.data$cellassign <-celltypes(fit)
pd1 <- DimPlot(obj.filter, label = T)
pd2 <- DimPlot(obj.filter, label = T,group.by = "cellassign")
pd1+pd2
使用点图显示不同细胞类型基因的表达量
feature <- c("MS4A1","FCGR3A","MS4A7","FCER1A","CST3","PPBP","CD8A","CCR7","IL7R","S100A4","CD14", "LYZ","GNLY","NKG7")
p1 <- DotPlot(obj.filter,features =feature)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
p2 <- DotPlot(obj.filter,features = feature,group.by = "cellassign")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
p1+p2
数据标准化
cellassign推荐使用scran包的函数computeSumFactors计算size factors
。
library(SingleCellExperiment)
library(scran)
sceset <- SingleCellExperiment(assays=list(counts=mat))
qclust <- quickCluster(sceset,min.size=30)
sceset <- computeSumFactors(sceset,sizes=15,clusters=qclust)
s <- sizeFactors(sceset)
fit <- cellassign(sceset[rownames(marker_gene_mat),],
marker_gene_info = marker_gene_mat,
s =s,
learning_rate = 1e-2,
shrinkage = TRUE,
verbose = FALSE)
pheatmap::pheatmap(cellprobs(fit))
library(ggplot2)
obj.filter <- obj[,colnames(mat)]
obj.filter@meta.data$cellassign <-celltypes(fit)
pd1 <- DimPlot(obj.filter, label = T)
pd2 <- DimPlot(obj.filter, label = T,group.by = "cellassign")
pd1+pd2
feature <- c("MS4A1","FCGR3A","MS4A7","FCER1A","CST3","PPBP","CD8A","CCR7","IL7R","S100A4","CD14", "LYZ","GNLY","NKG7")
p1 <- DotPlot(obj.filter,features =feature)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
p2 <- DotPlot(obj.filter,features = feature,group.by = "cellassign")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
p1+p2
利用纯化的 scRNA-seq 数据构建标记基因矩阵
在许多情况下,细胞类型的标记基因是从先验知识或者数据库(CellMarker)中获取。如果存在纯化的表达数据(bulk or single-cell) ,使用 scran
R 软件包中的 findMarkers
函数可以快速获得标记基因。
从Holik et al. Nucleic Acids Research 2017获取大量的 RNA-seq 数据,以获得3个不同细胞系的标记基因。被存储在holik_data
data(holik_data)
它包含一个计数矩阵,其中每一行是一个基因(索引由 entrez ID) ,每一列是一个样本:
head(holik_data$counts[,1:2])
RSCE_10_BC2CTUACXX_AGTTCC_L006_R1.bam RSCE_12_BC2CTUACXX_TGACCA_L005_R1.bam
1 13 12
2 0 0
9 346 286
10 0 1
12 10 17
13 1 0
以及每个样本的细胞来源的载体:
head(holik_data$cell_line)
[1] "HCC827" "HCC827" "H2228" "H2228" "H838" "H838"
准备
我们首先提供一个从entrez IDs
到gene symbols
的映射
entrez_map <- select(org.Hs.eg.db,
as.character(rownames(holik_data$counts)),
c("SYMBOL"), "ENTREZID")
#> 'select()' returned 1:1 mapping between keys and columns
gene_annotations <- entrez_map %>%
dplyr::rename(GeneID=ENTREZID,
Symbol=SYMBOL)
然后构建 DGELis
t 对象,输入Limma voom
,过滤掉低表达的基因:
dge <- DGEList(counts = holik_data$counts,
group = holik_data$cell_line,
genes = gene_annotations,
remove.zeros = TRUE)
#> Removing 3620 rows with all zero counts
genes_to_keep <- rowSums(cpm(dge$counts) > 0.5) >= 2
dge_filt <- dge[genes_to_keep,]
最后计算归一化因子:
dge_filt <- calcNormFactors(dge_filt, method="TMM")
差异表达
我们接下来使用 Limma Voom
在3个样品的子集上执行差异表达: HCC827,H2228,H1975:
dge_subset <- dge_filt[,dge_filt$samples$group %in% c("HCC827", "H2228", "H1975")]
design <- model.matrix(~ 0+dge_subset$samples$group)
colnames(design) <- levels(dge_subset$samples$group)
v <- voom(dge_subset, design)
fit <- lmFit(v, design)
接下来,进行拟合对比以发现细胞类型之间差异表达的基因:
contrast.matrix <- makeContrasts(H2228 - H1975,
HCC827 - H1975,
HCC827 - H2228,
levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
最后,计算基因总结统计并仅过滤显着差异表达的基因(FDR < 0.05) :
tt <- topTable(fit2, n=Inf)
tt_sig <- tt %>%
dplyr::filter(adj.P.Val < 0.05)
head(tt_sig)
#> GeneID Symbol H2228...H1975 HCC827...H1975 HCC827...H2228 AveExpr
#> 1 1956 EGFR 0.68777340 4.303436 3.615662 9.208711
#> 2 23480 SEC61G -0.40115418 4.414580 4.815734 7.784072
#> 3 81552 VOPP1 -0.09640406 4.347392 4.443796 6.998559
#> 4 1362 CPD 2.12265278 -2.103571 -4.226224 8.704180
#> 5 729086 LOC729086 -0.04338772 4.384976 4.428363 6.555193
#> 6 55915 LANCL2 -0.57454448 4.021963 4.596508 6.568625
#> F P.Value adj.P.Val
#> 1 2026.616 3.447052e-12 2.982899e-08
#> 2 2003.024 3.623981e-12 2.982899e-08
#> 3 1766.548 6.199752e-12 3.232724e-08
#> 4 1671.376 7.854996e-12 3.232724e-08
#> 5 1506.362 1.224645e-11 3.628466e-08
#> 6 1479.497 1.322488e-11 3.628466e-08
标记基因衍生
为了推导标记基因,我们首先使用 H1975作为基线表达创建一个对数倍数变化矩阵:
lfc_table <- tt_sig[,c("H2228...H1975", "HCC827...H1975")]
lfc_table <- lfc_table %>%
dplyr::mutate(H1975=0,
H2228=H2228...H1975,
HCC827=HCC827...H1975) %>%
dplyr::select(H1975, H2228, HCC827)
rownames(lfc_table) <- tt_sig$GeneID
然后,对于每个基因,我们减去最小对数倍数变化,因为我们关心基因相对于某个最小表达水平的过度表达,因为这定义了一个标记基因:
lfc_table <- as.matrix(lfc_table)
lfc_table <- lfc_table - rowMins(lfc_table)
lfc_table <- as.data.frame(lfc_table)
现在,我们定义了一个辅助函数,用于将log fold change
更改转换为二进制矩阵。这需要一个矩阵和一个阈值,任何小于或等于阈值的值都被设置为0,其他值都被设置为1:
binarize <- function(x, threshold) {
x[x <= threshold] <- -Inf
x[x > -Inf] <- 1
x[x == -Inf] <- 0
return(x)
}
接下来,我们实现了一个对这个矩阵进行二值化的基本过程。基本上,我们寻找每个基因最大的表达gap
,表达在这个gap
之上的细胞类型被指定具有该基因作为marker
# Find the biggest difference
maxdiffs <- apply(lfc_table, 1, function(x) max(diff(sort(x))))
#
thres_vals <- apply(lfc_table, 1, function(x) sort(x)[which.max(diff(sort(x)))])
expr_mat_thres <- plyr::rbind.fill(lapply(1:nrow(lfc_table), function(i) {
binarize(lfc_table[i,], thres_vals[i])
}))
rownames(expr_mat_thres) <- rownames(lfc_table)
marker_gene_mat <- expr_mat_thres[(maxdiffs >= quantile(maxdiffs, c(.99)))
& (thres_vals <= log(2)),] %>%
as.matrix
最后,我们添加gene symbols
,而不是entrez ids
:
suppressMessages({
symbols <- plyr::mapvalues(
rownames(marker_gene_mat),
from = gene_annotations$GeneID,
to = gene_annotations$Symbol
)
})
is_na <- is.na(symbols)
marker_gene_mat <- marker_gene_mat[!is_na,]
rownames(marker_gene_mat) <- symbols[!is_na]
这里我们有一个细胞类型的标记基因矩阵:
head(marker_gene_mat)
#> H1975 H2228 HCC827
#> NRK 0 0 1
#> MTAP 1 0 1
#> CP 0 1 0
#> HIST1H3F 1 0 1
#> HIST1H4E 1 0 1
#> ADIRF 1 1 0
pheatmap(t(marker_gene_mat))
安装存在的问题
更改R包 tensorflow 中使用的 Python 解释器位置
确定使用的新 Python 解释器的路径。例如,假设您希望使用位于 /usr/local/bin/python3.7 的 Python 3.7 解释器。
library(tensorflow)
reticulate::use_python("/usr/local/bin/python3.7")