标准化

 log2(18*1000000/sum(dat[,3])+1)
  ## 18 -- > 6.459884  ## SS2_15_0048_A5
  dat=log2(edgeR::cpm(dat)+1) 
  ##归一化的一种选择,这里是CPM(count-per-million,每百万碱基中每个转录本的count值)
  ###CPM只对read count相对总reads数做了数量的均一化,去除文库大小差异。
  apply(mRNA_count,2,function(x)x/sum(x)*1000000)

归一化

以下三者等价

pheatmap(expr_sig,col=col,show_colnames=F,show_rownames=F,scale="row")
pheatmap(t(scale(t(expr_sig))),col=col,show_colnames=F,show_rownames=F)
pheatmap(t(apply(expr_sig,1,function(x){ ( x-mean(x))/sd(x) })),col=col,show_colnames=F,show_rownames=F)

过滤表达量为0的行

dat=a[apply(a,1, function(x) sum(x>1) > floor(ncol(a)/50)),]

scale

scale: 对列进行操作

mat <- matrix(1:12, ncol = 3)
     [,1] [,2] [,3]
[1,]    1    5    9
[2,]    2    6   10
[3,]    3    7   11
[4,]    4    8   12
scale(mat,center = T, scale = T)
apply(mat,2,function(x){ ( x-mean(x)) /sd(x) })
           [,1]       [,2]       [,3]
[1,] -1.1618950 -1.1618950 -1.1618950
[2,] -0.3872983 -0.3872983 -0.3872983
[3,]  0.3872983  0.3872983  0.3872983
[4,]  1.1618950  1.1618950  1.1618950

scale默认对行进行归一化(减去均值,除以标准差)

gene_count <- apply(gene_count,1,function(x){ (x-mean(x)) / sd(x)})%>%t()

图片alt

图片alt

limma标准化

 dge <- DGEList(counts=expr)
 dge <- calcNormFactors(dge)   
 v <- voom(dge,design,plot=TRUE, normalize="quantile") 
  par(mfrow=c(1,1),mar=c(7,3,2,2))
 boxplot(log2(v[['E']]), boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
 expr <- expr[complete.cases(expr), ]
 v <- vooma(log2(expr), design, plot=T)
 expr <- normalizeBetweenArrays(expr) # normalize data
 par(mfrow=c(1,1),mar=c(7,3,2,2))
 boxplot(log2(v[['E']]), boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
      qx <- as.numeric(quantile(expr, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
      LogC <- (qx[5] > 100) ||
        (qx[6]-qx[1] > 50 && qx[2] > 0)
      if (LogC) {
        message("##############",GEO,"log2 转换")
        expr[which(expr <= 0)] <- NaN
        expr <- log2(expr) 
      }
      expr<- normalizeBetweenArrays(expr)
      par(mfrow=c(1,1),mar=c(7,3,2,2))
      boxplot(expr, boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])

标准化 VS 归一化

       GSM1062765 GSM1062766 GSM1062767 GSM1062768 GSM1062769
ABCA8    6.782756   7.128726   4.571173   5.631289   7.457750
ABCC3   10.612708  10.651995   9.720073   9.887744   6.458206
ABI3BP  10.810602   9.640165   8.725722   7.912306   8.213121
ACADL    5.396239   7.040040   4.790250   7.120997   5.107281
ADGRF1   5.463918   9.036063   9.297348  10.522882   5.316363
rowSums(t(apply(expr_sig,1,function(x){ ( x-mean(x))/sd(x)  })))[1:5]
#        ABCA8         ABCC3        ABI3BP         ACADL        ADGRF1 
#-2.681882e-15  2.740169e-14 -1.838980e-14 -1.357768e-14  2.004646e-14 
rowSums(t(apply(expr_sig,1,function(x){ ( x/sum(x))*1000 })))[1:5]
# ABCA8  ABCC3 ABI3BP  ACADL ADGRF1 
#  1000   1000   1000   1000   1000 

DESeq2 Vst

vsd <- varianceStabilizingTransformation(dds_filt,blind=FALSE) 
vsd@assays@data@listData[[1]]
assay(vsd)