展开

monocle2差异基因表达分析

最后发布时间 : 2023-01-11 16:56:54 浏览量 :
library(tidyverse)
lung_exprs_data <- readr::read_tsv("https://gitee.com/bioinfoFungi/monocle1/raw/master/data/lung_exprs_data.tsv",show_col_types = FALSE) |>
  column_to_rownames("symbol")
lung_phenotype_data <- readr::read_tsv("https://gitee.com/bioinfoFungi/monocle1/raw/master/data/lung_phenotype_data.tsv",show_col_types = FALSE) |>
  column_to_rownames("sample")
lung_feature_data <- readr::read_tsv("https://gitee.com/bioinfoFungi/monocle1/raw/master/data/lung_feature_data.tsv",show_col_types = FALSE) |>
  column_to_rownames("symbol")

head(lung_feature_data)
library(monocle)
pd <- new("AnnotatedDataFrame", data = lung_phenotype_data)
fd <- new("AnnotatedDataFrame", data = lung_feature_data)
HSMM <- newCellDataSet(as.matrix(lung_exprs_data),
                       phenoData = pd, featureData = fd, expressionFamily=tobit())

expressed_genes <- row.names(subset(fData(HSMM),
                                    num_cells_expressed >= 10))
HSMM <- estimateSizeFactors(HSMM)
HSMM <- estimateDispersions(HSMM)


HSMM <- detectGenes(HSMM, min_expr = 0.1)
print(head(fData(HSMM)))
expressed_genes <- row.names(subset(fData(HSMM),
                                    num_cells_expressed >= 10))

pData(HSMM)$Total_mRNAs <- Matrix::colSums(exprs(HSMM))





HSMM <- load_HSMM()
lung <- load_lung()


load_HSMM <- function(){
  HSMM_sample_sheet <- NA
  HSMM_gene_annotation <- NA
  HSMM_expr_matrix <- NA
  gene_short_name <- NA
  HSMM_expr_matrix <- readRDS("data/HSMM_expr_matrix.rds")
  HSMM_gene_annotation <- readRDS("data/HSMM_gene_annotation.rds")
  HSMM_sample_sheet <- readRDS("data/HSMM_sample_sheet.rds")
  pd <- new("AnnotatedDataFrame", data = HSMM_sample_sheet)
  fd <- new("AnnotatedDataFrame", data = HSMM_gene_annotation)
  HSMM <- newCellDataSet(as.matrix(HSMM_expr_matrix), phenoData = pd, featureData = fd)
  HSMM <- estimateSizeFactors(HSMM)
  HSMM <- estimateSizeFactors(HSMM)
  
  HSMM
}



##############################################################
HSMM <- load_HSMM()
HSMM <- estimateSizeFactors(HSMM)
HSMM <- estimateDispersions(HSMM)
HSMM <- detectGenes(HSMM, min_expr = 0.1)
print(head(fData(HSMM)))
expressed_genes <- row.names(subset(fData(HSMM),
                                    num_cells_expressed >= 10))

print(head(pData(HSMM)))



pData(HSMM)$Total_mRNAs <- Matrix::colSums(exprs(HSMM))
HSMM <- HSMM[,pData(HSMM)$Total_mRNAs < 1e6]
upper_bound <- 10^(mean(log10(pData(HSMM)$Total_mRNAs)) +
                     2*sd(log10(pData(HSMM)$Total_mRNAs)))
lower_bound <- 10^(mean(log10(pData(HSMM)$Total_mRNAs)) -
                     2*sd(log10(pData(HSMM)$Total_mRNAs)))

qplot(Total_mRNAs, data = pData(HSMM), color = Hours, geom =
        "density") +
  geom_vline(xintercept = lower_bound) +
  geom_vline(xintercept = upper_bound)
HSMM <- HSMM[,pData(HSMM)$Total_mRNAs > lower_bound &
               pData(HSMM)$Total_mRNAs < upper_bound]
HSMM <- detectGenes(HSMM, min_expr = 0.1)
print(head(fData(HSMM)))

L <- log(exprs(HSMM[expressed_genes,]))
## TODO
melted_dens_df <- melt(Matrix::t(scale(Matrix::t(L))))

MYF5_id <- row.names(subset(fData(HSMM), gene_short_name == "MYF5"))
ANPEP_id <- row.names(subset(fData(HSMM),gene_short_name == "ANPEP"))
cth <- newCellTypeHierarchy()
cth <- addCellType(cth, "Myoblast", classify_func =function(x) { x[MYF5_id,] >= 1 })
cth <- addCellType(cth, "Fibroblast", classify_func = function(x){ x[MYF5_id,] < 1 & x[ANPEP_id,] > 1 })

HSMM <- classifyCells(HSMM, cth, 0.1)

pData(HSMM)$CellType |> table()
Myoblast_cells <- row.names(subset(pData(HSMM), CellType=="Myoblast"))
HSMM_myo <- HSMM[,Myoblast_cells]
################################################################3

pie <- ggplot(pData(HSMM),
              aes(x = factor(1), fill = factor(CellType))) + geom_bar(width = 1)
pie + coord_polar(theta = "y") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())


disp_table <- dispersionTable(HSMM)
unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)
HSMM <- setOrderingFilter(HSMM, unsup_clustering_genes$gene_id)
plot_ordering_genes(HSMM)


plot_pc_variance_explained(HSMM, return_all = F)

HSMM <- reduceDimension(HSMM, max_components = 2, num_dim = 6,
                        reduction_method = 'tSNE', verbose = T)

HSMM <- clusterCells(HSMM, num_clusters = 2)
plot_cell_clusters(HSMM, 1, 2, color = "CellType",
                   markers = c("MYF5", "ANPEP"))
plot_cell_clusters(HSMM, 1, 2, color = "Media")

HSMM <- reduceDimension(HSMM, max_components = 2, num_dim = 2,
                        reduction_method = 'tSNE',
                        residualModelFormulaStr = "~Media + num_genes_expressed",
                        verbose = T)
HSMM <- clusterCells(HSMM, num_clusters = 2)
plot_cell_clusters(HSMM, 1, 2, color = "CellType")


plot_cell_clusters(HSMM, 1, 2, color = "Cluster") +
  facet_wrap(~CellType)

##########################################################
# Clustering cells using marker genes
##########################################################
marker_diff <- markerDiffTable(HSMM[expressed_genes,],
                               cth,
                               residualModelFormulaStr = "~Media + num_genes_expressed",
                               cores = 1)
candidate_clustering_genes <- row.names(subset(marker_diff, qval < 0.01))
marker_spec <- calculateMarkerSpecificity(HSMM[candidate_clustering_genes,], cth)
table(marker_spec$CellType)
head(selectTopMarkers(marker_spec, 3))

semisup_clustering_genes <- unique(selectTopMarkers(marker_spec, 500)$gene_id)
HSMM <- setOrderingFilter(HSMM, semisup_clustering_genes)
plot_ordering_genes(HSMM)
plot_pc_variance_explained(HSMM, return_all = F)


HSMM <- reduceDimension(HSMM, max_components = 2, num_dim = 3,
                        norm_method = 'log',
                        reduction_method = 'tSNE',
                        residualModelFormulaStr = "~Media + num_genes_expressed",
                        verbose = T)

HSMM <- clusterCells(HSMM, num_clusters = 2)
plot_cell_clusters(HSMM, 1, 2, color = "CellType")

########################################################



HSMM <- clusterCells(HSMM,
                     num_clusters = 2,
                     frequency_thresh = 0.1,
                     cell_type_hierarchy = cth)

plot_cell_clusters(HSMM, 1, 2, color = "CellType",
                   markers = c("MYF5", "ANPEP"))
factor(pData(HSMM)$CellType)


pie <- ggplot(pData(HSMM), aes(x = factor(1), fill =
                                 factor(CellType))) +
  geom_bar(width = 1)
pie + coord_polar(theta = "y") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())


########################################################
HSMM_myo
## Trajectory step 1: choose genes that define a cell's progress
diff_test_res <- differentialGeneTest(HSMM_myo[expressed_genes,],
                                      fullModelFormulaStr = "~Media")

ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))


HSMM_myo <- setOrderingFilter(HSMM_myo, ordering_genes)
plot_ordering_genes(HSMM_myo)
## Trajectory step 2: reduce data dimensionality

HSMM_myo <- reduceDimension(HSMM_myo, max_components = 2,
                            method = 'DDRTree')

## Trajectory step 3: order cells along the trajectory
HSMM_myo <- orderCells(HSMM_myo)

plot_cell_trajectory(HSMM_myo, color_by = "Hours")

plot_cell_trajectory(HSMM_myo, color_by = "State")

GM_state <- function(cds){
  if (length(unique(pData(cds)$State)) > 1){
    T0_counts <- table(pData(cds)$State, pData(cds)$Hours)[,"0"]
    return(as.numeric(names(T0_counts)[which
                                       (T0_counts == max(T0_counts))]))
  } else {
    return (1)
  }
}
GM_state(HSMM_myo)
HSMM_myo <- orderCells(HSMM_myo, root_state = GM_state(HSMM_myo))
plot_cell_trajectory(HSMM_myo, color_by = "Pseudotime")


plot_cell_trajectory(HSMM_myo, color_by = "State") +
  facet_wrap(~State, nrow = 1)




blast_genes <- row.names(subset(fData(HSMM_myo),
                                gene_short_name %in% c("CCNB2", "MYOD1", "MYOG")))
plot_genes_jitter(HSMM_myo[blast_genes,],
                  grouping = "State",
                  min_expr = 0.1)

HSMM_expressed_genes <-  row.names(subset(fData(HSMM_myo),
                                          num_cells_expressed >= 10))

HSMM_filtered <- HSMM_myo[HSMM_expressed_genes,]

my_genes <- row.names(subset(fData(HSMM_filtered),
                             gene_short_name %in% c("CDK1", "MEF2C", "MYH3")))
cds_subset <- HSMM_filtered[my_genes,]
fData(cds_subset)
plot_genes_in_pseudotime(cds_subset, color_by = "Hours")


###############################################
HSMM_myo <- detectGenes(HSMM_myo, min_expr = 0.1)
fData(HSMM_myo)$use_for_ordering <-fData(HSMM_myo)$num_cells_expressed > 0.05 * ncol(HSMM_myo)
plot_pc_variance_explained(HSMM_myo, return_all = F)
HSMM_myo <- reduceDimension(HSMM_myo,
                            max_components = 2,
                            norm_method = 'log',
                            num_dim = 3,
                            reduction_method = 'tSNE',
                            verbose = T)

HSMM_myo <- clusterCells(HSMM_myo, verbose = F)
plot_cell_clusters(HSMM_myo, color_by = 'as.factor(Cluster)')
plot_cell_clusters(HSMM_myo, color_by = 'as.factor(Hours)')

plot_rho_delta(HSMM_myo, rho_threshold = 2, delta_threshold = 4 )

HSMM_myo <- clusterCells(HSMM_myo,
                         rho_threshold = 2,
                         delta_threshold = 4,
                         skip_rho_sigma = T,
                         verbose = F)

plot_cell_clusters(HSMM_myo, color_by = 'as.factor(Cluster)')
plot_cell_clusters(HSMM_myo, color_by = 'as.factor(Hours)')

clustering_DEG_genes <-
  differentialGeneTest(HSMM_myo[HSMM_expressed_genes,],
                       fullModelFormulaStr = '~Cluster',
                       cores = 1)


########################################################

marker_genes <- row.names(subset(fData(HSMM_myo),
                                 gene_short_name %in% c("MEF2C", "MEF2D", "MYF5",
                                                        "ANPEP", "PDGFRA","MYOG",
                                                        "TPM1",  "TPM2",  "MYH2",
                                                        "MYH3",  "NCAM1", "TNNT1",
                                                        "TNNT2", "TNNC1", "CDK1",
                                                        "CDK2",  "CCNB1", "CCNB2",
                                                        "CCND1", "CCNA1", "ID1")))


diff_test_res <- differentialGeneTest(HSMM_myo[marker_genes,],
                                      fullModelFormulaStr = "~Media")

sig_genes <- subset(diff_test_res, qval < 0.1)
sig_genes[,c("gene_short_name", "pval", "qval")]


MYOG_ID1 <- HSMM_myo[row.names(subset(fData(HSMM_myo),
                                      gene_short_name %in% c("MYOG", "CCNB2"))),]

plot_genes_jitter(MYOG_ID1, grouping = "Media", ncol= 2)

##############################################
to_be_tested <- row.names(subset(fData(HSMM),
                                 gene_short_name %in% c("UBC", "NCAM1", "ANPEP")))
cds_subset <- HSMM[to_be_tested,]



#### Finding Genes that Change as a Function of Pseudotime
to_be_tested <- row.names(subset(fData(HSMM),
                                 gene_short_name %in% c("MYH3", "MEF2C", "CCNB2", "TNNT1")))
cds_subset <- HSMM_myo[to_be_tested,]
diff_test_res <- differentialGeneTest(cds_subset,
                                      fullModelFormulaStr = "~sm.ns(Pseudotime)")

diff_test_res[,c("gene_short_name", "pval", "qval")]
plot_genes_in_pseudotime(cds_subset, color_by = "Hours")



diff_test_res <- differentialGeneTest(HSMM_myo[marker_genes,],
                                      fullModelFormulaStr = "~sm.ns(Pseudotime)")

sig_gene_names <- row.names(subset(diff_test_res, qval < 0.1))

plot_pseudotime_heatmap(HSMM_myo[sig_gene_names,],
                        num_clusters = 3,
                        cores = 1,
                        show_rownames = T)
lung <- load_lung()
plot_cell_trajectory(lung, color_by = "Time")

BEAM_res <- BEAM(lung, branch_point = 1, cores = 1)
BEAM_res <- BEAM_res[order(BEAM_res$qval),]
BEAM_res <- BEAM_res[,c("gene_short_name", "pval", "qval")]
plot_genes_branched_heatmap(lung[row.names(subset(BEAM_res,
                                                  qval < 1e-4)),],
                            branch_point = 1,
                            num_clusters = 4,
                            cores = 1,
                            use_gene_short_name = T,
                            show_rownames = T)