我是靠谱客的博主 犹豫香水,最近开发中收集的这篇文章主要介绍Rtsne的问题总结visulize in python,觉得挺不错的,现在分享给大家,希望可以做个参考。

概述

今天在造dropout模拟数据集的时候,发现了一个问题,Rtsne的结果和skelarn中的tsne画图结果非常不同,让我感到很好奇,到底是什么原因导致这个的

%load_ext rpy2.ipython
%%R -o counts -o truecounts -o geneinfo -o cellinfo

# make sure that splatter is installed: https://github.com/Oshlack/splatter
suppressPackageStartupMessages({
  library(splatter)
  library(Rtsne)
  library(ggplot2)
  library(repr)
})


simulate <- function(nGroups=2, nGenes=200, batchCells=2000, dropout=3)
{
    if (nGroups > 1) method <- 'groups'
    else             method <- 'single'

    group.prob <- rep(1, nGroups) / nGroups

    # new splatter requires dropout.type
    if ('dropout.type' %in% slotNames(newSplatParams())) {
        if (dropout)
            dropout.type <- 'experiment'
        else
            dropout.type <- 'none'
        
        sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
                             dropout.type=dropout.type, method=method,
                             seed=42, dropout.shape=-1, dropout.mid=dropout)

    } else {
        sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
                             dropout.present=!dropout, method=method,
                             seed=42, dropout.shape=-1, dropout.mid=dropout)        
    }

    counts     <- as.data.frame(t(counts(sim)))
    truecounts <- as.data.frame(t(assays(sim)$TrueCounts))

    dropout    <- assays(sim)$Dropout
    mode(dropout) <- 'integer'

    cellinfo   <- as.data.frame(colData(sim))
    geneinfo   <- as.data.frame(rowData(sim))

    list(counts=counts,
         cellinfo=cellinfo,
         geneinfo=geneinfo,
         truecounts=truecounts)
}
sim <- simulate(nGroups=6, dropout=1) ## 因为这个地方的dropout从1变成了3,导致两者不一致

counts <- sim$counts
geneinfo <- sim$geneinfo
cellinfo <- sim$cellinfo
truecounts <- sim$truecounts

X <- t(counts) ## counts with dropout
Y <- as.integer(substring(cellinfo$Group,6))
Y <- Y-1

X.normalized <- apply(X, 2, function(z) z/sum(z)) ## 我大概知道这个图的原因了,因为sc.ppnor
X.normalized <- log(X.normalized + 1)
tsne.X <- Rtsne(t(X.normalized), max_iter = 1000)
  
tsne_plot.X <- data.frame(`x-tsne` = tsne.X$Y[,1], `y-tsne` = tsne.X$Y[,2], 
                        truelabel = Y, check.names = F)
tsne_plot.X$truelabel <- factor(tsne_plot.X$truelabel, levels = c(0:max(Y)))

ggplot(tsne_plot.X) + geom_point(aes(x=`x-tsne`, y=`y-tsne`, color=truelabel), size=0.5) +
          labs(color='true label') +
          ggtitle("Simulated data") +
          theme_bw() +
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
                legend.key = element_rect(fill = 'white', colour = 'white'), legend.position="none",
                axis.title.y=element_blank(), axis.title.x=element_blank())

# sim <- simulate()

# counts <- sim$counts
# geneinfo <- sim$geneinfo
# cellinfo <- sim$cellinfo
# truecounts <- sim$truecounts
## 我也可以在R中显示的

结果如下
在这里插入图片描述

visulize in python

import scanpy as sc 
adata = sc.AnnData(counts.values, obs=cellinfo, var=geneinfo)
adata.obs_names = cellinfo.Cell
adata.var_names = geneinfo.Gene

adata.obs.index = list(adata.obs.index) ## add this line to avoid error
adata.var.index = list(adata.var.index) ## add this line to avoid error


#sc.pp.filter_genes(adata, min_counts=1)


adata_true = sc.AnnData(truecounts.values, obs=cellinfo, var=geneinfo)
adata_true.obs_names = cellinfo.Cell
adata_true.var_names = geneinfo.Gene
adata_true.obs.index = list(adata_true.obs.index) ## add this line to avoid error
adata_true.var.index = list(adata_true.var.index) ## add this line to avoid error

adata_true = adata_true[:, adata.var_names].copy()

sc.pp.normalize_per_cell(adata)
sc.pp.normalize_per_cell(adata_true)

sc.pp.log1p(adata)
sc.pp.log1p(adata_true)

sc.pp.pca(adata)
sc.pp.pca(adata_true)

sc.tl.tsne(adata)
sc.tl.tsne(adata_true)

sc.pp.neighbors(adata)
sc.pp.neighbors(adata_true)
sc.pl.tsne(adata,color=["Group"],title="dropout_tsne")
sc.pl.tsne(adata_true,color=["Group"],title="true_tsne")

sc.tl.umap(adata)
sc.tl.umap(adata_true)
sc.pl.umap(adata,color=["Group"],title="dropout_umap")
sc.pl.umap(adata_true,color=["Group"],title="true_out")

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述可以看到这结果的tsne看起来是不一样的,这个结果很奇怪,Rtsne有两个cluster竟然显示出来了,而python中的tsne直接混合了数据

我首先排除了可能是Rtsne的原因

# X_raw=adata.X.copy()
# Y=adata.obs["Group"].cat.codes.values

X_raw=adata_true.X.copy()
Y=adata_true.obs["Group"].cat.codes.values

%%R -i X_raw -i Y
tsne.X <- Rtsne(X_raw, max_iter = 1000)
  
tsne_plot.X <- data.frame(`x-tsne` = tsne.X$Y[,1], `y-tsne` = tsne.X$Y[,2], 
                        truelabel = Y, check.names = F)
tsne_plot.X$truelabel <- factor(tsne_plot.X$truelabel, levels = c(0:max(Y)))

ggplot(tsne_plot.X) + geom_point(aes(x=`x-tsne`, y=`y-tsne`, color=truelabel), size=0.5) +
          labs(color='true label') +
          ggtitle("Simulated data") +
          theme_bw() +
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
                legend.key = element_rect(fill = 'white', colour = 'white'), legend.position="none",
                axis.title.y=element_blank(), axis.title.x=element_blank())

结果如下
在这里插入图片描述所以唯一的问题,就是这个预处理的过程
R的预处理流程如下
在这里插入图片描述而scanpy的处理流程如下
在这里插入图片描述根据我前几天刚看的sc.pp.normal_per_cell的实现原理,里面涉及到中位数,但是R中的标准化并没有,也就是说scanpy得到的结果可能会更稳定,这也就是结果的原因
我验证了这个结果,我在R中实现了中位数的标准化,

%%R -o counts -o truecounts -o geneinfo -o cellinfo

# make sure that splatter is installed: https://github.com/Oshlack/splatter
suppressPackageStartupMessages({
  library(splatter)
  library(Rtsne)
  library(ggplot2)
  library(repr)
})


simulate <- function(nGroups=2, nGenes=200, batchCells=2000, dropout=3)
{
    if (nGroups > 1) method <- 'groups'
    else             method <- 'single'

    group.prob <- rep(1, nGroups) / nGroups

    # new splatter requires dropout.type
    if ('dropout.type' %in% slotNames(newSplatParams())) {
        if (dropout)
            dropout.type <- 'experiment'
        else
            dropout.type <- 'none'
        
        sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
                             dropout.type=dropout.type, method=method,
                             seed=42, dropout.shape=-1, dropout.mid=dropout)

    } else {
        sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
                             dropout.present=!dropout, method=method,
                             seed=42, dropout.shape=-1, dropout.mid=dropout)        
    }

    counts     <- as.data.frame(t(counts(sim)))
    truecounts <- as.data.frame(t(assays(sim)$TrueCounts))

    dropout    <- assays(sim)$Dropout
    mode(dropout) <- 'integer'

    cellinfo   <- as.data.frame(colData(sim))
    geneinfo   <- as.data.frame(rowData(sim))

    list(counts=counts,
         cellinfo=cellinfo,
         geneinfo=geneinfo,
         truecounts=truecounts)
}
sim <- simulate(nGroups=6, dropout=1) ## 因为这个地方的dropout从1变成了3,导致两者不一致

counts <- sim$counts
geneinfo <- sim$geneinfo
cellinfo <- sim$cellinfo
truecounts <- sim$truecounts

X <- t(counts) ## counts with dropout
Y <- as.integer(substring(cellinfo$Group,6))
Y <- Y-1

#X.normalized <- apply(X, 2, function(z) z/sum(z))
X_t=t(X)
me=median(apply(X_t,1,sum))
sf=me/apply(X_t,1,sum)
X.normalized=diag(sf)%*%X_t

X.normalized <- log(X.normalized + 1)
tsne.X <- Rtsne(X.normalized, max_iter = 1000)
  
tsne_plot.X <- data.frame(`x-tsne` = tsne.X$Y[,1], `y-tsne` = tsne.X$Y[,2], 
                        truelabel = Y, check.names = F)
tsne_plot.X$truelabel <- factor(tsne_plot.X$truelabel, levels = c(0:max(Y)))

ggplot(tsne_plot.X) + geom_point(aes(x=`x-tsne`, y=`y-tsne`, color=truelabel), size=0.5) +
          labs(color='true label') +
          ggtitle("Simulated data") +
          theme_bw() +
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
                legend.key = element_rect(fill = 'white', colour = 'white'), legend.position="none",
                axis.title.y=element_blank(), axis.title.x=element_blank())

# sim <- simulate()

# counts <- sim$counts
# geneinfo <- sim$geneinfo
# cellinfo <- sim$cellinfo
# truecounts <- sim$truecounts
## 我也可以在R中显示的

结果如下
在这里插入图片描述

最后

以上就是犹豫香水为你收集整理的Rtsne的问题总结visulize in python的全部内容,希望文章能够帮你解决Rtsne的问题总结visulize in python所遇到的程序开发问题。

如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(41)

评论列表共有 0 条评论

立即
投稿
返回
顶部