GSE以128033为例,处理多个大数据集seurat首先创建多个对象seurat对象 再变为一个list 通过findanchor整合合并 最终获得超大数据集合 Proliferating SPP1/MERTK-expressing macrophages in idiopathic pulmonary fibrosis
https://satijalab.org/seurat/archive/v3.0/integration.html
#教程地址 #https://cloud.tencent.com/developer/article/1697249 #https://bioconductor.org/packages/release/data/experiment/vignettes/scRNAseq/inst/doc/scRNAseq.html #https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzI1Njk4ODE0MQ==&action=getalbum&album_id=1326587538303434752&scene=173&from_msgid=2247484689&from_itemidx=1&count=3&nolastread=1#wechat_redirect rm(list = ls()) Sys.setenv(R_MAX_NUM_DLLS=999) options(stringsAsFactors = F) ·```
########################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################duoge数据 准备原始分析数据 先手动下载 下载文件到浏览器自己的文件夹,然后解压 #https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi
一个样本,一个文件。需要分成文件夹命名
对geo解压下载的文件 获取以下文件 处理上述文件 分类。需要分成文件夹并重命名
getwd() path="D:/yll/inflammatory-singlecells/GSE128033/GSE128033" #空间转录组 dir.create(path) setwd(path) getwd()
工作目录设置错误时候,出现一下结果 因此,更改工作目录
fs=list.files('./','^GSM') #得到当前目录的一切GSM开头的文件名称 fs
自行下载GSE128033数据集的GSE128033压缩包并解压哦,这样上面的代码就可以运行了
然后获取样本信息,因为它是批量的,所以下面的代码可能不容易理解,需要掌握R语言
library(stringr) samples=str_split(fs,'_',simplify = T)[,1] #取出可分组的样本名 得到样本分组 samples
getwd() ###注意workshop决定下一句话是否成功的位置! setwd("D:/yll/inflammatory-singlecells/GSE128033/")
lapply(unique(samples),function(x){ #x=unique(samples)[1] y=fs[grepl(x,fs)] folder=paste0("GSE128033/", str_split(y[1],'_',simplify = T)[,1]) dir.create(folder,recursive = T) #创建每个样本的子文件夹 file.rename(paste0("GSE128033/",y[1]),file.path(folder,"barcodes.tsv.gz")) #重命名文件,移动到相应的子文件夹 file.rename(paste0("GSE128033/",y[2]),file.path(folder,"features.tsv.gz")) file.rename(paste0("GSE128033/",y[3]),file.path(folder,"matrix.mtx.gz")) })
成功移动文件
如下: 每个文件夹下的命名如下:
getwd() samples=list.files("GSE128033/") samples# 是多个文件夹的名称
samples=list.files("GSE128033/") %>% grep(pattern = "gz",invert = TRUE,value = TRUE) samples# 是两个文件夹的名字
2#创建Seurat对象 library(Seurat) getwd() # 循环读取两个文件夹下的10x的的3个文件 sceList = lapply(samples,function(po){
#pro="GSM3660641"
folder=file.path("GSE128033/",pro)
CreateSeuratObject(counts = Read10X(folder), #此处一定要加上筛选条件
project = pro ,min.cells=3,min.features=200)
##print(paste0("进行到第",pro,"个样本")) 不可以打印这一排
})
for (i in 1:length(sceList)) {#计算线粒体比例
sceList[[i]][["percent.mt"]]=PercentageFeatureSet(sceList[[i]],pattern = "^MT-")
}
但是是不是应该进行质量控制 再进行下一步呢 否则,现在不进行质量控制的话最后的结果就会变成下面这样
此处可以写个循环来对list之内的每个seurat对象进行质控,那么双细胞要不要也去除呢
# 较接近原文过滤后的数量2149,但感觉条件有点宽松了
# 网上看了相关教程,一般ERCC占比不高于10%!
sum(scRNA$percent.ERCC< 10) #就只剩下460个cell,明显低于文献中的数量
pctERCC=40
scRNA <- subset(scRNA, subset = percent.ERCC < pctERCC)
dim(scRNA)
# >20047 2142 #原文为19752 2149
#质量控制
features = c("nFeature_RNA","nCount_RNA","percent.mt"))
>200 <20
# Quality Control figure
VlnPlot(norandipf.integrated,
features = c("nFeature_RNA","nCount_RNA","percent.mt"))
head(norandipf.integrated@meta.data)
p1=FeatureScatter(norandipf.integrated,feature1 = "nCount_RNA",feature2 = "percent.mt")
p2=FeatureScatter(norandipf.integrated,feature1 = "nCount_RNA",feature2 = "nFeature_RNA")
p1+p2
sceList DefaultAssay(sceList)
getwd() save(sceList,file=“D:/yll/inflammatory-singlecells/GSE128033/scelist.rdata” )
sceList[[1]]@project.name
for (i in 1:length(sceList)) {#查看list之内pro与index的对应关系 #i=1 print(paste0(sceList[[i]]@project.name,paste0(“:这是第”,i,“个”))) }
#取自己感兴趣的集合
mysamples=c("GSM3660645","GSM3660646","GSM3660647","GSM3660648",
"GSM3660651","GSM3660652",
"GSM3660654","GSM3660655",
"GSM3660657","GSM3660658")
norandipf.list=list()
for (i in 1:length(sceList)) {#取出想要的seurat对象,并给列表也命名了
#i=1
if (sceList[[i]]@project.name %in% mysamples) {
norandipf.list[[paste(sceList[[i]]@project.name)]]=sceList[[i]]
}
print(paste0(sceList[[i]]@project.name,paste0(":这是第",i,"个")))
}
names(norandipf.list)
head(norandipf.list[[1]]@meta.data)
##这一步就没必要了,因为初始读入seurat对象时候已经筛选过了
for (i in names(norandipf.list)) {
#i="GSM3660645"
norandipf.list[[i]]=subset(norandipf.list[[i]],#过滤掉基因表达为0的细胞
subset = nCount_RNA> 0 & nFeature_RNA>0)
}
https://satijalab.org/seurat/archive/v3.0/integration.html
# normalize and identify variable features for each dataset independently
norandipf.list<- lapply(X = norandipf.list, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
'''
#此处省略该步骤sct转化 比较耗时 因为是大数据,此处为了下一步的整合分析
#memory.limit(size = 35000)
for (i in names(norandipf.list)) {
#i="GSM3660645"
norandipf.list[[i]]=SCTransform(norandipf.list[[i]],verbose = FALSE)
}
'''
#选择锚定基因
DefaultAssay(norandipf.list[[1]])
features=SelectIntegrationFeatures(object.list = norandipf.list)
norandipf.list <- lapply(X = norandipf.list, FUN = function(x) {#注意等号=和<-的区别
x <- ScaleData(x, features = features, verbose = FALSE)
x <- RunPCA(x, features = features, verbose = FALSE)
})
'''
#预整合 此步骤省略
norandipf.list=PrepSCTIntegration(object.list = norandipf.list,
anchor.features =norandipf.features )
'''
#在找锚点合并之前,需要把每个seurat对象的细胞名改变成唯一
getwd()
#save(norandipf.list,file = "G:/silicosis/geo/GSE128033_SnRNAseq-idiopathic pulmonary fibrosis/norandipf.list.rds")
Idents(norandipf.list[[1]])
names(norandipf.list[[1]])
#new_obj <- RenameCells(obj, new.names=gsub("-1", "_1", colnames(obj)))
for (i in names(norandipf.list)) {
norandipf.list[[i]]=RenameCells(norandipf.list[[i]],
new.names =sub(pattern = "1",
replacement =str_split(pattern = "0",
names(norandipf.list)[str_detect(names(norandipf.list),pattern = i)],
simplify = TRUE)[2],
x=colnames(norandipf.list[[i]])) )
}
#选择参考数据集,这里选择51 52 因为他俩细胞数最多 这里使用的reference是个向量!!!而不是seurat对象!
reference_dataset=which(names(norandipf.list)==c("GSM3660651","GSM3660652"))
#reference_dataset=c(norandipf.list[["GSM3660651"]],norandipf.list[["GSM3660652"]])
#找锚点
norandipf.anchors=FindIntegrationAnchors(object.list = norandipf.list,
reduction = "rpca",
anchor.features = features,
reference =reference_dataset )#c(5,6)
#整合
DefaultAssay(norandipf.list[[1]]) #[1] "RNA"
norandipf.integrated=IntegrateData(anchorset = norandipf.anchors,
dims = 1:30) %>%
ScaleData(verbose=FALSE) %>%
RunPCA(verbose=FALSE) %>%
RunTSNE(verbose=FALSE)%>%
RunUMAP(dims=1:30)
DefaultAssay(norandipf.list[[1]]) DefaultAssay(norandipf.integrated) #Perform an integrated analysis #Now we can run a single integrated analysis on all cells DefaultAssay(norandipf.integrated) <- “integrated”
norandipf.integrated<-FindNeighbors(norandipf.integrated,reduction = "pca", dims = 1:30) %>%
FindClusters()
markers=FindAllMarkers(norandipf.integrated,only.pos = TRUE,min.pct = 0.3)
getwd()
#save(norandipf.integrated,file = "D:/yll/inflammatory-singlecells/GSE128033/Nor_and_IPF.Rdata")
load("D:/yll/inflammatory-singlecells/GSE128033/Nor_and_IPF.Rdata")
DimPlot(norandipf.integrated,group.by = "orig.ident")
DefaultAssay(norandipf.integrated)
norandipf.integrated<-FindNeighbors(norandipf.integrated,reduction = "pca", dims = 1:30) %>%
FindClusters()
markers=FindAllMarkers(norandipf.integrated,only.pos = TRUE,min.pct = 0.3)
DT::datatable(markers)
Idents(norandipf.integrated)
norandipf.integrated=RenameIdents(norandipf.integrated, “0”=“Club”,“11”=“AT1”, “16”=‘AT1’, ‘18’=‘AT2’,‘9’=‘ciliated cells’, ‘8’=‘Fibroblasts’,‘12’=‘Fibroblasts’, ‘14’=‘smooth muscle cells’, ‘5’=‘Endothelial cells’, ‘10’=‘other_Mes’,‘22’=‘other_Mes’,‘24’=‘other_Mes’,‘28’=‘other_Mes’, ‘23’=‘Lymphatic endothelial cells’, ‘1’=‘Macrophages’,‘3’=‘Macrophages’,‘7’=‘Macrophages’, ‘25’=‘dc’,‘19’=‘dc’, ‘6’=‘T’,‘13’=‘T’, ‘21’=‘B’,‘27’=‘other_B’, ‘17’=‘plasma’)
table(Idents(norandipf.integrated),norandipf.integrated$orig.ident)
3#过滤质控
#raw_sce=get(load(file = 'sce.big.merge.ls_12.Rdata'))
#线粒体基因
raw_sce=sce.big
rownames(raw_sce)[grepl('^mt-',rownames(raw_sce),ignore.case = T)]
#核糖体蛋白基因
rownames(raw_sce)[grepl('^Rp[sl]',rownames(raw_sce),ignore.case = T)]
#计算上述两种基因在所有细胞中的比例
raw_sce[["percent.mt"]] <- PercentageFeatureSet(raw_sce, pattern = "^MT-")
rb.genes <- rownames(raw_sce)[grep("^RP[SL]",rownames(raw_sce))]
C<-GetAssayData(object = raw_sce, slot = "counts") #计算核糖体比例时候使用
percent.ribo <- Matrix::colSums(C[rb.genes,])/Matrix::colSums(C)*100
raw_sce <- AddMetaData(raw_sce, percent.ribo, col.name = "percent.ribo")
plot1 <- FeatureScatter(raw_sce, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(raw_sce, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
VlnPlot(raw_sce, features = c("percent.ribo", "percent.mt"), ncol = 2)
VlnPlot(raw_sce, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)
VlnPlot(raw_sce, features = c("percent.ribo", "nCount_RNA"), ncol = 2)
raw_sce #9644个
#按照三个指标过滤细胞
raw_sce1 <- subset(raw_sce, subset = nFeature_RNA > 200 & nCount_RNA > 1000 & percent.mt < 20)
raw_sce1 #7915个
dim(raw_sce1)
4#降维聚类
sce=raw_sce1
#counts归一化
sce <- NormalizeData(sce, normalization.method = "LogNormalize",
scale.factor = 10000)
GetAssay(sce,assay = "RNA")
#前2000个高变feature RNA
sce <- FindVariableFeatures(sce,
selection.method = "vst", nfeatures = 2000)
# 中心化,为下一步PCA做准备
sce <- ScaleData(sce)
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce))
#看看前12个主成分
DimHeatmap(sce, dims = 1:12, cells = 100, balanced = TRUE)
#判断最终选取的主成分数,这里我判断17个
ElbowPlot(sce)
table(sce$orig.ident)
head(colnames(sce))
library(dplyr)
unlist(str_split(colnames(sce),"_")) %>% grep(pattern = "GSM5015",value = TRUE)
table(unlist(str_split(colnames(sce),"_")) %>% grep(pattern = "GSM5015",value = TRUE))
grep("^GSM5015042",colnames(sce))
head(colnames(sce))
sce$stim<-c(rep("GSM5015042",length(grep("GSM5015042",colnames(sce)))), #顺序是按照合并的顺序来的
rep("GSM5015043",length(grep("GSM5015043",colnames(sce)))),
rep("GSM5015044",length(grep("GSM5015044",colnames(sce)))))
table(sce$stim)
Idents(sce)
subset_data=sce
library('harmony')
subset_data <- subset_data %>% RunHarmony("stim", plot_convergence = TRUE)
harmony_embeddings <- Embeddings(subset_data, 'harmony')
#######################cluster
dims = 1:30
subset_data <- subset_data %>%
RunUMAP(reduction = "harmony", dims = dims) %>%
RunTSNE(reduction = "harmony", dims = dims) %>%
FindNeighbors(reduction = "harmony", dims = dims)
save(subset_data, file="mouse_lung_stim.rds")
load(file ="mouse_lung_stim.rds" )
subset_data@meta.data$stim
names(subset_data@meta.data)[6]
names(subset_data@meta.data)[6]<-"stim"
table(subset_data)
gsub('["]','',paste0(myfilename,".rds")) # 删除字符串a中的双引号
getwd()
file="G:/silicosis/geo/GSE164621_RNA-seq_wildtype_mouse_lung"
#"default"默认分辨率
myfilename="mouse_lung_stim"
library(xlsx)
for (r in c("_default_")) {
load(paste(file, gsub('["]','',paste0(myfilename,".rds")), sep="/"))
dir.create(paste(file,r,sep="/"))
setwd(paste(file,r,sep="/"))
subset_data <- FindClusters(subset_data)
save(subset_data, file=paste0(myfilename,"_r", r, ".rds"))
subset_data <- ScaleData(subset_data, verbose = FALSE, features = rownames(subset_data))
pdf(paste0("1_",myfilename,"_cluster_TSNE.pdf"))
p = DimPlot(subset_data, reduction = "tsne", label = TRUE, pt.size = 1,label.size = 6)
print(p)
dev.off()
pdf(paste0("1_",myfilename,"_cluster_UMAP.pdf"))
p = DimPlot(subset_data, reduction = "umap", label = TRUE, pt.size = 1,label.size = 6)
print(p)
dev.off()
for (i in c("umap","tsne")) {
pdf(paste0("2_",i,"_",myfilename,"_split.pdf"))
p = DimPlot(subset_data, reduction = i,
split.by = "stim",label = TRUE, pt.size = 1,label.size = 6)
print(p)
dev.off()
}
stat = cbind(table(Idents(subset_data), subset_data$stim), All = rowSums(table(Idents(subset_data), subset_data$stim)))
stat = rbind(stat, All = colSums(stat))
write.xlsx(stat, paste0("2_",myfilename,"_cluster_stat.xlsx"), col.names=T, row.names=T)
######################## find marker
markers <- FindAllMarkers(subset_data, min.pct = 0.25, logfc.threshold = 0.25, only.pos=T)
write.xlsx(markers,paste0(r,"_3_",myfilename,"_cluster_markers.xlsx"), col.names=T,row.names=T)
}
head(markers)
filetered_marker=markers %>% group_by(cluster) %>%filter(p_val_adj==0)
head(filetered_marker)
table(filetered_marker$cluster)
load("G:/silicosis/geo/GSE164621_RNA-seq_wildtype_mouse_lung/_default_/mouse_lung_stim_r_default_.rds")
###########
#######
######
##########
######
# 分成25个cluster
#tSNE可视化
set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:18, do.fast = TRUE)
DimPlot(sce,reduction = "tsne",label=T)
dim(sce)
# 在两个样本里25个cluster的分布情况
table(sce@meta.data$seurat_clusters,sce@meta.data$orig.ident)
DimPlot(sce,reduction = "tsne",label=T,split.by ='orig.ident')
#save(sce,file='merge_for_tSNE.pos.Rdata')
load(file="merge_for_tSNE.pos.Rdata")
#再尝试用ggplot2可视化上图
phe=data.frame(cell=rownames(sce@meta.data),
cluster =sce@meta.data$seurat_clusters)
phe
tsne_pos=Embeddings(sce,'tsne')
dat=cbind(tsne_pos,phe)
head(dat)
library(ggplot2)
p=ggplot(dat,aes(x=tSNE_1,y=tSNE_2,color=cluster))+geom_point(size=0.95)
p=p+stat_ellipse(data=dat,aes(x=tSNE_1,y=tSNE_2,fill=cluster,color=cluster),
geom = "polygon",alpha=0.2,level=0.9,type="t",linetype =2,show.legend = F)+coord_fixed()
print(p)
theme= theme(panel.grid =element_blank()) + ## 删去网格
theme(panel.border = element_blank(),panel.background = element_blank()) + ## 删去外层边框
theme(axis.line = element_line(size=1, colour = "black"))
p+theme+guides(colour = guide_legend(override.aes = list(size=5)))
#寻找每个cluster的高变代表基因,并选取前5个,进行可视化
table(sce@meta.data$seurat_clusters)
p <- list()
for( i in unique(sce@meta.data$seurat_clusters) ){
markers_df <- FindMarkers(object = sce, ident.1 = i, min.pct = 0.25)
print(x = head(markers_df))
markers_genes = rownames(head(x = markers_df, n = 4))
p1 <- VlnPlot(object = sce, features =markers_genes,log =T,ncol = 2)
p[[i]][[1]] <- p1
p2 <- FeaturePlot(object = sce, features=markers_genes,ncol = 2)
p[[i]][[2]] <- p2
}
p[[1]][[2]]
p[[2]][[1]]
#查阅所有的marker基因,可用于人工注释cell type
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25,
thresh.use = 0.25)
DT::datatable(sce.markers)
head(sce.markers)
#热图可视化每个cluster的marker基因表达差异
library(dplyr)
top10 <- sce.markers %>% group_by(cluster) %>% top_n(10, avg_log2FC)
DoHeatmap(sce,top10$gene,size=3)
#save(sce,sce.markers,file = 'sce.output.merge.ls_12.Rdata')
#load(file = "sce.output.merge.ls_12.Rdata")
DotPlot(sce,features = c("Hamp","Slc40a1","Mybphl","Fmo3","Dll4","Vegfa"))
FeaturePlot(sce,features = c("Hamp","Slc40a1","Mybphl","Fmo3"))
DotPlot(sce,features = c("Mybphl","Myl7","Myl4"))
makers14<-FindMarkers(sce,ident.1 = 14,
logfc.threshold = 0.25,
only.pos = TRUE,min.pct = 0.25)
DT::datatable(makers14)
DotPlot(sce,features = c("Acta2","Myl7","Mybphl","Aldh1b1","Got1","Cmya5","Srl","Ttn"))
5#clusters细胞类型注释
#我刚开始用SingleR是在19年的年底,现在使用方法可能有一些变化了。因为Single在注释细胞的过程中,会用到一些参考数据集,我们可以把这些数据集保存下来,下一次使用就可以直接加载,省去了重新下载参考集的时间。
library(SingleR)
sce_for_SingleR <- GetAssayData(sce, slot="data")
clusters=sce@meta.data$seurat_clusters
head(clusters)
#BiocManager::install('celldex')
library('celldex')
mouseImmu <- ImmGenData()
#接下来是注释步骤,在这一步里,我只用了main大类注释,还有一个fine小类注释,我就不演示了,因为我觉得小类注释不太准
#注意使用main或者fine参数
pred.mouseImmu <- SingleR(test = sce_for_SingleR, ref = mouseImmu, labels = mouseImmu$label.main,
method = "cluster", clusters = clusters,
assay.type.test = "logcounts", assay.type.ref = "logcounts")
mouseRNA <- MouseRNAseqData()
pred.mouseRNA <- SingleR(test = sce_for_SingleR, ref = mouseRNA, labels = mouseRNA$label.main ,
method = "cluster", clusters = clusters, # #因为样本主要为免疫细胞(而不是全部细胞),因此设置为label.fine
assay.type.test = "logcounts", assay.type.ref = "logcounts")
cellType=data.frame(ClusterID=levels(sce@meta.data$seurat_clusters),
mouseImmu=pred.mouseImmu$labels,
mouseRNA=pred.mouseRNA$labels )
head(cellType)
sce@meta.data$singleR=cellType[match(clusters,cellType$ClusterID),'mouseRNA']
pro='first_anno'
DimPlot(sce,reduction = "tsne",label=T, group.by = 'singleR')
ggplot2::ggsave(filename = paste0(pro,'_tSNE_singleR.pdf'))
DimPlot(sce,reduction = "tsne",label=T,split.by ='orig.ident',group.by = 'singleR')
ggplot2::ggsave(filename = paste0(pro,'_tSNE_singleR_by_orig.ident.pdf'))
save(sce,file = 'last_sce.Rdata')
table(pred.mouseRNA$labels)
plotScoreHeatmap(pred.mouseRNA)
#tSNE可视化
celltype = data.frame(ClusterID=rownames(pred.mouseRNA), celltype=pred.mouseRNA$labels, stringsAsFactors = F)
#如下为sce对象注释细胞cluster鉴定结果。
sce@meta.data$celltype = "NA"
#先新增列celltype,值均为NA,然后利用下一行代码循环填充
for(i in 1:nrow(celltype)){
sce@meta.data[which(sce@meta.data$seurat_clusters == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
DimPlot(sce, group.by="celltype", label=T, label.size=5, reduction='tsne')
#利用ggplot2再可视化上图
phe=data.frame(cell=rownames(sce@meta.data),
cluster =clusters,
celltype=sce@meta.data$celltype)
tsne_pos=Embeddings(sce,'tsne')
dat=cbind(tsne_pos,phe)
head(dat)
p=ggplot(dat,aes(x=tSNE_1,y=tSNE_2,color=celltype))+geom_point(size=0.95)
p=p+stat_ellipse(data=dat,aes(x=tSNE_1,y=tSNE_2,fill=celltype,color=celltype),
geom = "polygon",alpha=0.2,level=0.9,type="t",linetype = 2,show.legend = F)+coord_fixed()
print(p)
theme= theme(panel.grid =element_blank()) + ## 删去网格
theme(panel.border = element_blank(),panel.background = element_blank()) + ## 删去外层边框
theme(axis.line = element_line(size=1, colour = "black"))
p+theme+guides(colour = guide_legend(override.aes = list(size=5)))