library(tidyverse) lung_exprs_data <- readr::read_tsv("https://gitee.com/bioinfoFungi/monocle1/raw/master/data/lung_exprs_data.tsv",show_col_types = FALSE) |> column_to_rownames("symbol") lung_phenotype_data <- readr::read_tsv("https://gitee.com/bioinfoFungi/monocle1/raw/master/data/lung_phenotype_data.tsv",show_col_types = FALSE) |> column_to_rownames("sample") lung_feature_data <- readr::read_tsv("https://gitee.com/bioinfoFungi/monocle1/raw/master/data/lung_feature_data.tsv",show_col_types = FALSE) |> column_to_rownames("symbol") head(lung_feature_data) library(monocle) pd <- new("AnnotatedDataFrame", data = lung_phenotype_data) fd <- new("AnnotatedDataFrame", data = lung_feature_data) HSMM <- newCellDataSet(as.matrix(lung_exprs_data), phenoData = pd, featureData = fd, expressionFamily=tobit()) expressed_genes <- row.names(subset(fData(HSMM), num_cells_expressed >= 10)) HSMM <- estimateSizeFactors(HSMM) HSMM <- estimateDispersions(HSMM) HSMM <- detectGenes(HSMM, min_expr = 0.1) print(head(fData(HSMM))) expressed_genes <- row.names(subset(fData(HSMM), num_cells_expressed >= 10)) pData(HSMM)$Total_mRNAs <- Matrix::colSums(exprs(HSMM)) HSMM <- load_HSMM() lung <- load_lung() load_HSMM <- function(){ HSMM_sample_sheet <- NA HSMM_gene_annotation <- NA HSMM_expr_matrix <- NA gene_short_name <- NA HSMM_expr_matrix <- readRDS("data/HSMM_expr_matrix.rds") HSMM_gene_annotation <- readRDS("data/HSMM_gene_annotation.rds") HSMM_sample_sheet <- readRDS("data/HSMM_sample_sheet.rds") pd <- new("AnnotatedDataFrame", data = HSMM_sample_sheet) fd <- new("AnnotatedDataFrame", data = HSMM_gene_annotation) HSMM <- newCellDataSet(as.matrix(HSMM_expr_matrix), phenoData = pd, featureData = fd) HSMM <- estimateSizeFactors(HSMM) HSMM <- estimateSizeFactors(HSMM) HSMM } ############################################################## HSMM <- load_HSMM() HSMM <- estimateSizeFactors(HSMM) HSMM <- estimateDispersions(HSMM) HSMM <- detectGenes(HSMM, min_expr = 0.1) print(head(fData(HSMM))) expressed_genes <- row.names(subset(fData(HSMM), num_cells_expressed >= 10)) print(head(pData(HSMM))) pData(HSMM)$Total_mRNAs <- Matrix::colSums(exprs(HSMM)) HSMM <- HSMM[,pData(HSMM)$Total_mRNAs < 1e6] upper_bound <- 10^(mean(log10(pData(HSMM)$Total_mRNAs)) + 2*sd(log10(pData(HSMM)$Total_mRNAs))) lower_bound <- 10^(mean(log10(pData(HSMM)$Total_mRNAs)) - 2*sd(log10(pData(HSMM)$Total_mRNAs))) qplot(Total_mRNAs, data = pData(HSMM), color = Hours, geom = "density") + geom_vline(xintercept = lower_bound) + geom_vline(xintercept = upper_bound) HSMM <- HSMM[,pData(HSMM)$Total_mRNAs > lower_bound & pData(HSMM)$Total_mRNAs < upper_bound] HSMM <- detectGenes(HSMM, min_expr = 0.1) print(head(fData(HSMM))) L <- log(exprs(HSMM[expressed_genes,])) ## TODO melted_dens_df <- melt(Matrix::t(scale(Matrix::t(L)))) MYF5_id <- row.names(subset(fData(HSMM), gene_short_name == "MYF5")) ANPEP_id <- row.names(subset(fData(HSMM),gene_short_name == "ANPEP")) cth <- newCellTypeHierarchy() cth <- addCellType(cth, "Myoblast", classify_func =function(x) { x[MYF5_id,] >= 1 }) cth <- addCellType(cth, "Fibroblast", classify_func = function(x){ x[MYF5_id,] < 1 & x[ANPEP_id,] > 1 }) HSMM <- classifyCells(HSMM, cth, 0.1) pData(HSMM)$CellType |> table() Myoblast_cells <- row.names(subset(pData(HSMM), CellType=="Myoblast")) HSMM_myo <- HSMM[,Myoblast_cells] ################################################################3 pie <- ggplot(pData(HSMM), aes(x = factor(1), fill = factor(CellType))) + geom_bar(width = 1) pie + coord_polar(theta = "y") + theme(axis.title.x = element_blank(), axis.title.y = element_blank()) disp_table <- dispersionTable(HSMM) unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1) HSMM <- setOrderingFilter(HSMM, unsup_clustering_genes$gene_id) plot_ordering_genes(HSMM) plot_pc_variance_explained(HSMM, return_all = F) HSMM <- reduceDimension(HSMM, max_components = 2, num_dim = 6, reduction_method = 'tSNE', verbose = T) HSMM <- clusterCells(HSMM, num_clusters = 2) plot_cell_clusters(HSMM, 1, 2, color = "CellType", markers = c("MYF5", "ANPEP")) plot_cell_clusters(HSMM, 1, 2, color = "Media") HSMM <- reduceDimension(HSMM, max_components = 2, num_dim = 2, reduction_method = 'tSNE', residualModelFormulaStr = "~Media + num_genes_expressed", verbose = T) HSMM <- clusterCells(HSMM, num_clusters = 2) plot_cell_clusters(HSMM, 1, 2, color = "CellType") plot_cell_clusters(HSMM, 1, 2, color = "Cluster") + facet_wrap(~CellType) ########################################################## # Clustering cells using marker genes ########################################################## marker_diff <- markerDiffTable(HSMM[expressed_genes,], cth, residualModelFormulaStr = "~Media + num_genes_expressed", cores = 1) candidate_clustering_genes <- row.names(subset(marker_diff, qval < 0.01)) marker_spec <- calculateMarkerSpecificity(HSMM[candidate_clustering_genes,], cth) table(marker_spec$CellType) head(selectTopMarkers(marker_spec, 3)) semisup_clustering_genes <- unique(selectTopMarkers(marker_spec, 500)$gene_id) HSMM <- setOrderingFilter(HSMM, semisup_clustering_genes) plot_ordering_genes(HSMM) plot_pc_variance_explained(HSMM, return_all = F) HSMM <- reduceDimension(HSMM, max_components = 2, num_dim = 3, norm_method = 'log', reduction_method = 'tSNE', residualModelFormulaStr = "~Media + num_genes_expressed", verbose = T) HSMM <- clusterCells(HSMM, num_clusters = 2) plot_cell_clusters(HSMM, 1, 2, color = "CellType") ######################################################## HSMM <- clusterCells(HSMM, num_clusters = 2, frequency_thresh = 0.1, cell_type_hierarchy = cth) plot_cell_clusters(HSMM, 1, 2, color = "CellType", markers = c("MYF5", "ANPEP")) factor(pData(HSMM)$CellType) pie <- ggplot(pData(HSMM), aes(x = factor(1), fill = factor(CellType))) + geom_bar(width = 1) pie + coord_polar(theta = "y") + theme(axis.title.x = element_blank(), axis.title.y = element_blank()) ######################################################## HSMM_myo ## Trajectory step 1: choose genes that define a cell's progress diff_test_res <- differentialGeneTest(HSMM_myo[expressed_genes,], fullModelFormulaStr = "~Media") ordering_genes <- row.names (subset(diff_test_res, qval < 0.01)) HSMM_myo <- setOrderingFilter(HSMM_myo, ordering_genes) plot_ordering_genes(HSMM_myo) ## Trajectory step 2: reduce data dimensionality HSMM_myo <- reduceDimension(HSMM_myo, max_components = 2, method = 'DDRTree') ## Trajectory step 3: order cells along the trajectory HSMM_myo <- orderCells(HSMM_myo) plot_cell_trajectory(HSMM_myo, color_by = "Hours") plot_cell_trajectory(HSMM_myo, color_by = "State") GM_state <- function(cds){ if (length(unique(pData(cds)$State)) > 1){ T0_counts <- table(pData(cds)$State, pData(cds)$Hours)[,"0"] return(as.numeric(names(T0_counts)[which (T0_counts == max(T0_counts))])) } else { return (1) } } GM_state(HSMM_myo) HSMM_myo <- orderCells(HSMM_myo, root_state = GM_state(HSMM_myo)) plot_cell_trajectory(HSMM_myo, color_by = "Pseudotime") plot_cell_trajectory(HSMM_myo, color_by = "State") + facet_wrap(~State, nrow = 1) blast_genes <- row.names(subset(fData(HSMM_myo), gene_short_name %in% c("CCNB2", "MYOD1", "MYOG"))) plot_genes_jitter(HSMM_myo[blast_genes,], grouping = "State", min_expr = 0.1) HSMM_expressed_genes <- row.names(subset(fData(HSMM_myo), num_cells_expressed >= 10)) HSMM_filtered <- HSMM_myo[HSMM_expressed_genes,] my_genes <- row.names(subset(fData(HSMM_filtered), gene_short_name %in% c("CDK1", "MEF2C", "MYH3"))) cds_subset <- HSMM_filtered[my_genes,] fData(cds_subset) plot_genes_in_pseudotime(cds_subset, color_by = "Hours") ############################################### HSMM_myo <- detectGenes(HSMM_myo, min_expr = 0.1) fData(HSMM_myo)$use_for_ordering <-fData(HSMM_myo)$num_cells_expressed > 0.05 * ncol(HSMM_myo) plot_pc_variance_explained(HSMM_myo, return_all = F) HSMM_myo <- reduceDimension(HSMM_myo, max_components = 2, norm_method = 'log', num_dim = 3, reduction_method = 'tSNE', verbose = T) HSMM_myo <- clusterCells(HSMM_myo, verbose = F) plot_cell_clusters(HSMM_myo, color_by = 'as.factor(Cluster)') plot_cell_clusters(HSMM_myo, color_by = 'as.factor(Hours)') plot_rho_delta(HSMM_myo, rho_threshold = 2, delta_threshold = 4 ) HSMM_myo <- clusterCells(HSMM_myo, rho_threshold = 2, delta_threshold = 4, skip_rho_sigma = T, verbose = F) plot_cell_clusters(HSMM_myo, color_by = 'as.factor(Cluster)') plot_cell_clusters(HSMM_myo, color_by = 'as.factor(Hours)') clustering_DEG_genes <- differentialGeneTest(HSMM_myo[HSMM_expressed_genes,], fullModelFormulaStr = '~Cluster', cores = 1) ######################################################## marker_genes <- row.names(subset(fData(HSMM_myo), gene_short_name %in% c("MEF2C", "MEF2D", "MYF5", "ANPEP", "PDGFRA","MYOG", "TPM1", "TPM2", "MYH2", "MYH3", "NCAM1", "TNNT1", "TNNT2", "TNNC1", "CDK1", "CDK2", "CCNB1", "CCNB2", "CCND1", "CCNA1", "ID1"))) diff_test_res <- differentialGeneTest(HSMM_myo[marker_genes,], fullModelFormulaStr = "~Media") sig_genes <- subset(diff_test_res, qval < 0.1) sig_genes[,c("gene_short_name", "pval", "qval")] MYOG_ID1 <- HSMM_myo[row.names(subset(fData(HSMM_myo), gene_short_name %in% c("MYOG", "CCNB2"))),] plot_genes_jitter(MYOG_ID1, grouping = "Media", ncol= 2) ############################################## to_be_tested <- row.names(subset(fData(HSMM), gene_short_name %in% c("UBC", "NCAM1", "ANPEP"))) cds_subset <- HSMM[to_be_tested,] #### Finding Genes that Change as a Function of Pseudotime to_be_tested <- row.names(subset(fData(HSMM), gene_short_name %in% c("MYH3", "MEF2C", "CCNB2", "TNNT1"))) cds_subset <- HSMM_myo[to_be_tested,] diff_test_res <- differentialGeneTest(cds_subset, fullModelFormulaStr = "~sm.ns(Pseudotime)") diff_test_res[,c("gene_short_name", "pval", "qval")] plot_genes_in_pseudotime(cds_subset, color_by = "Hours") diff_test_res <- differentialGeneTest(HSMM_myo[marker_genes,], fullModelFormulaStr = "~sm.ns(Pseudotime)") sig_gene_names <- row.names(subset(diff_test_res, qval < 0.1)) plot_pseudotime_heatmap(HSMM_myo[sig_gene_names,], num_clusters = 3, cores = 1, show_rownames = T) lung <- load_lung() plot_cell_trajectory(lung, color_by = "Time") BEAM_res <- BEAM(lung, branch_point = 1, cores = 1) BEAM_res <- BEAM_res[order(BEAM_res$qval),] BEAM_res <- BEAM_res[,c("gene_short_name", "pval", "qval")] plot_genes_branched_heatmap(lung[row.names(subset(BEAM_res, qval < 1e-4)),], branch_point = 1, num_clusters = 4, cores = 1, use_gene_short_name = T, show_rownames = T)