MultiBaC包消除不同组学数据之间的批次效应

66 阅读 0 评论 44 点赞

我是靠谱客的博主动听硬币，这篇文章主要介绍MultiBaC包消除不同组学数据之间的批次效应，现在分享给大家，希望可以做个参考。

MultiBaC包: 能够消除在不同批次中生成的不同组学之间的批次效应。因此，MultiBaC是一种利用现有数据集跨组学技术进行元分析的有效工具。

1. 包和演示数据集的加载

# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("MultiBaC")

library(MultiBaC)
data("multiyeast")

2. 处理单一组学数据

# createMbac {MultiBaC}:This function creates a list object to be used by 
# MultiBaC function from a set of matrix R objects.
# batch source :different labs generating data, time or lab technician etc.
data_RNA <- createMbac (inputOmics = list(A.rna, B.rna, C.rna), 
                        batchFactor = c("A", "B", "C"),
                        experimentalDesign = list("A" =  c("Glu+", "Glu+", 
                                                           "Glu+", "Glu-", 
                                                           "Glu-", "Glu-"),
                                                  "B" = c("Glu+", "Glu+", 
                                                          "Glu-", "Glu-"),
                                                  "C" = c("Glu+", "Glu+", 
                                                          "Glu-", "Glu-")),
                        omicNames = "RNA")

custom_col <- c("brown2", "dodgerblue", "forestgreen") # 颜色表示不同批次
custom_pch <- c(19,19,19,1,1,1,
                19,19,1,1,
                19,19,1,1)  # 形状表示样品的不同处理

### 原始数据，PCA作图查看批次效应
# ?plot.mbac
plot(data_RNA, typeP="pca.org", 
     bty = "L", # 只有下框
     pch = custom_pch, # 点形状表示样本处理方式
     cex = 3, # 点的大小
     col.per.group = custom_col, # 不同批次颜色
     # 注释文字
     legend.text = c("Color: Batch", names(data_RNA$ListOfBatches),
                     "Fill: Cond.", unique(colData(data_RNA$ListOfBatches$A)$tfactor)),
     # 注释位置，形状，大小，颜色等
     args.legend = list("x" = "topright",
                        "pch" = c(NA, 15, 15, 15, 
                                  NA, 15, 0),
                        "col" = c(NA, "brown2", "dodgerblue", "forestgreen",
                                  NA, 1, 1),
                        "bty" = "n",
                        "cex" = 1))

par(mfrow = c(1,2))
### 去除批次效应 (ARSyN:ASCA Removal of Systematic Noise)
#Interaction = FALSE, experimental factors和bacth factor没有相互作用
arsyn_1 <- ARSyNbac(data_RNA, modelName = "RNA", Variability = 0.95, 
                    batchEstimation = TRUE, 
                    Interaction = FALSE)
# 校正后数据的PCA作图
plot(arsyn_1, typeP="pca.cor", bty = "L",
     pch = custom_pch, cex = 3, col.per.group = custom_col,
     legend.text = c("Color: Batch", names(data_RNA$ListOfBatches),
                     "Fill: Cond.", unique(colData(data_RNA$ListOfBatches$A)$tfactor)),
     args.legend = list("x" = "topright",
                        "pch" = c(NA, 15, 15, 15, 
                                  NA, 15, 0),
                        "col" = c(NA, "brown2", "dodgerblue", "forestgreen",
                                  NA, 1, 1),
                        "bty" = "n",
                        "cex" = 1.2))
# plot函数同时显示校正前和校正后的PCA图
plot(arsyn_1, typeP="pca.both", bty = "L",
     pch = custom_pch, cex = 1, col.per.group = custom_col,
     legend.text = c("Color: Batch", names(data_RNA$ListOfBatches),
                     "Fill: Cond.", unique(colData(data_RNA$ListOfBatches$A)$tfactor)),
     args.legend = list("x" = "topright",
                        "pch" = c(NA, 15, 15, 15, 
                                  NA, 15, 0),
                        "col" = c(NA, "brown2", "dodgerblue", "forestgreen",
                                  NA, 1, 1),
                        "bty" = "n",
                        "cex" = 0.5))
# plot_pca函数显示校正前和校正后的PCA图
plot_pca(arsyn_1, col.by.batch = TRUE, col.per.group = NULL,
         comp2plot = c(1, 2), typeP = "pca.both", 
         legend.text = c("Color: Batch", names(data_RNA$ListOfBatches),
                         "Fill: Cond.", unique(colData(data_RNA$ListOfBatches$A)$tfactor)),
         args.legend = list("x" = "topright",
                            "pch" = c(NA, 15, 15, 15, 
                                      NA, 15, 0),
                            "col" = c(NA, "brown2", "dodgerblue", "forestgreen",
                                      NA, 1, 1),
                            "bty" = "n",
                            "cex" = 0.5))


par(mfrow = c(1,2))
# Interaction = TRUE, experimental factors和bacth factor有相互作用
arsyn_2 <- ARSyNbac(data_RNA, modelName = "RNA", Variability = 0.95, 
                    batchEstimation = TRUE, 
                    Interaction = TRUE)
plot(arsyn_2, typeP="pca.cor", bty = "L",
     pch = custom_pch, cex = 3, col.per.group = custom_col,
     legend.text = c("Color: Batch", names(data_RNA$ListOfBatches),
                     "Fill: Cond.", unique(colData(data_RNA$ListOfBatches$A)$tfactor)),
     args.legend = list("x" = "topright",
                        "pch" = c(NA, 15, 15, 15, 
                                  NA, 15, 0),
                        "col" = c(NA, "brown2", "dodgerblue", "forestgreen",
                                  NA, 1, 1),
                        "bty" = "n",
                        "cex" = 1.2))                

# Displays the structure and the content of the object of class mbac.
summary(data_RNA)

3. 处理多组学数据

# 不同batch要含有共同的组学数据，这里为rna数据。
my_mbac <- createMbac (inputOmics = list(A.rna, A.gro, 
                                         B.rna, B.ribo, 
                                         C.rna, C.par), 
                       batchFactor = c("A", "A", 
                                       "B", "B", 
                                       "C", "C"),
                       experimentalDesign = list("A" =  c("Glu+", "Glu+", 
                                                          "Glu+", "Glu-", 
                                                          "Glu-", "Glu-"),
                                                 "B" = c("Glu+", "Glu+", 
                                                         "Glu-", "Glu-"),
                                                 "C" = c("Glu+", "Glu+", 
                                                         "Glu-", "Glu-")),
                       omicNames = c("RNA", "GRO", 
                                     "RNA", "RIBO", 
                                     "RNA", "PAR"))
# This function uses linear models to estimate the batch effect 
# magnitude using the common data across batches.
batchEstPlot(my_mbac)

## 一步法校正
# MultiBaC: performs a multi-omic, multi-batch correction
my_final_mbac <- MultiBaC (my_mbac,
                           test.comp = NULL, scale = FALSE, 
                           center = TRUE, crossval = NULL, 
                           Interaction = FALSE,
                           Variability = 0.9,
                           showplot = TRUE,
                           showinfo = TRUE)

## 分多步校正，结果一样
# genModelList {MultiBaC}：This function performs PLS models for every batch. 
# A PLS model is generated for each non-common omic in each batch.
# PLS：multivariate partial least squares (PLS)
my_mbac_2 <- genModelList (my_mbac, test.comp = NULL, 
                           scale = FALSE, center = TRUE,
                           crossval = NULL,
                           showinfo = TRUE)

# genMissingOmics {MultiBaC}:This function generates for all the batches 
# the omic data they had not originally. 
# This is the previous step to apply ARSyNbac [1] correction.
multiBatchDesign <- genMissingOmics(my_mbac_2)

# batchCorrection {MultiBaC}: This function performs the ARSyNbac 
# correction [1] for each omic contained in mulBatchDesign input object.
my_finalwise_mbac <- batchCorrection(my_mbac_2, 
                                     multiBatchDesign = multiBatchDesign,
                                     Interaction = FALSE,
                                     Variability = 0.90)

plot(my_final_mbac, typeP = "batch")
plot (my_finalwise_mbac, typeP = "batch")

# 校正前
plot (my_final_mbac, typeP = "pca.org",
      cex.axis = 1, cex.lab = 1, cex = 3, bty = "L", 
      cex.main = 1.2, pch = 19)
# 校正后
plot (my_final_mbac, typeP = "pca.cor", 
      cex.axis = 1, cex.lab = 1, cex = 3, bty = "L", 
      cex.main = 1.2, pch = 19)


custom_col <- c("brown2", "dodgerblue", "forestgreen")
custom_pch <- c(19,19,19,1,1,1,15,15,15,0,0,0, # batch A
                19,19,1,1,17,17,2,2,  # batch B
                19,19,1,1,18,18,5,5)  # batch C

plot(my_final_mbac, typeP = "pca.both", col.by.batch = TRUE, 
     col.per.group = custom_col, comp2plot = 1:3,
     cex.axis = 1.3, cex.lab = 1.2, cex = 3, bty = "L", 
     cex.main = 1.7, pch = custom_pch,
     legend.text = c("Color", names(my_final_mbac$ListOfBatches),
                     "Shape", c("RNA", "GRO", "RIBO", "PAR"),
                     "Fill", levels(colData(my_final_mbac$ListOfBatches$A)$tfactor)),
     args.legend = list("x" = "topright",
                        "pch" = c(NA, 15, 15, 15, 
                                  NA, 19, 15, 17, 18, 
                                  NA, 19, 1),
                        "col" = c(NA, "brown2", "dodgerblue", "forestgreen",
                                  NA, rep(1, 4),
                                  NA, 1, 1),
                        "bty" = "n",
                        "cex" = 2))

# Q2_plot(mbac, ...), Q2 plot of PLS models is displayed.
# mbac:Object of class mbac generated by *MultiBaC* or *genModelList*.

Q2_plot(my_final_mbac)

4. 提取校正后的表达矩阵

### 得到校正后的表达矩阵
A_rna_cor <- assays(my_final_mbac$CorrectedData$A)$RNA
AA_rna_cor <- assays(my_finalwise_mbac$CorrectedData$A)$RNA 
head(A_rna_cor)
head(AA_rna_cor)
table(A_rna_cor==AA_rna_cor)

head(A.rna)