10X单细胞转录组测序—常规流程

10X单细胞转录组测序—常规流程

首页枪战射击Project RIP更新时间:2024-04-26

文章中其中用到的文件获取方式https://pan.baidu.com/s/1HCsHRXNX9Il8u8RIPXSEug?pwd=2626

1.上游分析1.1 安装conda、sratoolkit、cellranger

#安装conda,最好去官网下载最新版 cd ~ wget https://repo.anaconda.com/archive/Anaconda3-2021.11-Linux-x86_64.sh bash Anaconda3-2021.11-Linux-x86_64.sh #安装sratoolkit,用于下载sra数据,同时也会安装fastq-dump #去官网下载最新版本,conda安装的不好用,https://github.com/ncbi/sra-tools/wiki/01.-Downloading-SRA-Toolkit #conda install -c daler sratoolkit #安装cellranger #官网下载最新版cellranger https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest #执行以下命令进行安装 tar -xzvf cellranger-6.1.2.tar #解压完即可使用,但需要添加到环境变量 1.2 使用conda进行常用软件安装

conda install -y -c bioconda aspera-cli bwa samtools bedtools bowtie2 fasterq-dump hisat2 cutadapt multiQC 1.3 参考基因组下载

##1.2 mm10 wget https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz md5sum refdata-gex-mm10-2020-A.tar.gz #886eeddde8731ffb58552d0bb81f533d refdata-gex-mm10-2020-A.tar.gz tar -xzvf refdata-gex-mm10-2020-A.tar.gz ##1.3 hg38 wget https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2020-A.tar.gz md5sum refdata-gex-GRCh38-2020-A.tar.gz #dfd654de39bff23917471e7fcc7a00cd refdata-gex-GRCh38-2020-A.tar.gz tar -xzvf refdata-gex-GRCh38-2020-A.tar.gz 1.4 文件夹创建

mkdir 1.sra 2.raw_fastq 3.cellranger 1.5 数据下载

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112302 #SRA Run Selector里面选择需要的数据下载,在该篇文章中是下载6个tumor数据。选择要下载的数据样本后点击Accession List获取相应的id号。并将其命名为id,保存在1.sra文件夹中。 cd 1.sra cat id | while read id;do (prefetch $id &);done 1.6 SRA转fastq

cd 2.fastq ln -s ../1.sra/SRR* ./ ls SRR* |while read id;do (nohup fasterq-dump -O ./ --split-files -e 6 ./$id --include-technical & );done #上一步运行完会非常占用空间,可压缩节省空间。 ls SRR*fastq | while read id;do gzip $id;done 1.7 cellranger count流程

##修改文件名,改成cellranger可识别的文件名。 cat ../1.sra/id |while read id ;do (mv ${id}_1*.gz ${id}_S1_L001_I1_001.fastq.gz;mv ${id}_2*.gz ${id}_S1_L001_R1_001.fastq.gz;mv ${id}_3*.gz ${id}_S1_L001_R2_001.fastq.gz);done ##运行cellranger #指定参考基因组 ref=/home/data/vip10t17/software_install/10x_refernce/refdata-gex-GRCh38-2020-A ls *.fastq.gz | cut -d "_" -f 1 |uniq |while read id;do cellranger count --id $id --transcriptome $ref --fastqs 2.raw.fastq/ --sample $id --nosecondary --localcores 10 --localmem 30;done ##参数解读 --id 指定输出文件夹的名字 --transcriptome 指定参考基因组的路径 --sample 指定需要处理的fastq文件的前缀 --expect-cell 指定预期的细胞数目,默认参数是3000个 --localcores 指定计算的核心数 --mempercore 指定内存大小 GB --nosecondary 不需要进行降维聚类(因为后期会用R可视化) --chemistry,默认是自动识别chemistry,但是有些时候识别不出chemistry的时候,需要加入参数特别标明 1.8 结果解读

将所有结果中的filtered_gene_bc_matrices文件夹保存在自己电脑output文件夹下,进行下游分析

2. 下游分析2.1 R包安装、加载

library(Seurat) library(ggplot2) library(clustree) library(cowplot) library(dplyr) library(harmony) library(SingleR) library(celldex) library(EnhancedVolcano) library(tidyverse) library(ggrepel) library(rgl) library(fgsea) library(clustree) library(patchwork) 2.2 数据读取过滤

#设置工作路径 setwd('E:/project/test/') dir='E:/project/test/cellranger' #设置样本名 samples=c("100","400") samples #获取单细胞数据 sceList = lapply(samples,function(pro){ print(pro) sce=CreateSeuratObject( Read10X(file.path(dir,pro)), project = pro, min.cells = 5, min.features = 200) return(sce) }) #会自动读取output文件夹下所有的文件,同时进行数据过滤, #一个基因最少细胞表达数为5,一个细胞最少基因表达量为300。 #合并数据 PBMC.all=merge(x=sceList[[1]], y=sceList[[2]], add.cell.ids = c("100","400")) #计算^MT-线粒体基因所占的比例,人:全大写,小鼠:全小写 PBMC.all[["percent.mt"]] = PercentageFeatureSet(PBMC.all,pattern = "^mt-") #计算血红细胞相关基因比例,人:全大写,小鼠:首字母大写 其余小写 PBMC.all[["percent.hb"]] = PercentageFeatureSet(PBMC.all,pattern = "^Hb[^(p)]") #分析核糖体基因。人:全大写,小鼠:首字母大写 其余小写 PBMC.all[["percent.rp"]] = PercentageFeatureSet(PBMC.all,pattern = "^Rp[sl]") Vlnplot(PBMC.all, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.hb", "percent.rp"), ncol = 3) #其中nFeature_RNA:一个细胞中检测到的基因总数,nCount_RNA:一个细胞中检测到的分子数 #根据数据的分布进行过滤,主要过滤极值,其中mt部分一般是小于10,但也不绝对 #肌肉细胞的线粒体基因占比最高可达50%,肿瘤区域的正常细胞线粒体含量有时也在30%以上 #但为了尽可能保留更多的数据可以提高这个数值,HB部分一般是小于3。 PBMC.all = subset(PBMC.all,subset = nFeature_RNA<7500&percent.mt<15&percent.hb<3&percent.rp<50) #通过质控检测数据过滤的好坏 plot1 = FeatureScatter(PBMC.all,feature1 = "nCount_RNA", feature2 = "percent.mt") NoLegend() plot1#会呈现出三角的形成 plot2 = FeatureScatter(PBMC.all,feature1 = "nCount_RNA", feature2 = "percent.hb") NoLegend() plot2#会呈现成横线状 plot3 = FeatureScatter(PBMC.all,feature1 = "nCount_RNA", feature2 = "nFeature_RNA") NoLegend() plot3#会呈现出圆弧状 #将三张图合并显示 CombinePlots(plots = list(plot1, plot2,plot3)) #计算基因reads数贡献值 C <- PBMC.all@assays$RNA@counts C <- Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100 most_expressed <- order(apply(C, 1, median), decreasing = T)[20:1] boxplot(t(as.matrix(C[most_expressed, ])), cex = 0.1, las = 1, xlab = "% total count per cell", col = (scales::hue_pal())(20)[20:1], horizontal = TRUE) #结果中根据占比,删除占比较大的基因,其中MALAT1的高检测可能是一个技术问题 dim(PBMC.all) # 过滤MALAT1基因 PBMC.all <- PBMC.all[!grepl("Malat1", rownames(PBMC.all)), ] # 过滤线粒体基因 PBMC.all <- PBMC.all[!grepl("^mt-", rownames(PBMC.all)), ] # 过滤核糖体基因 PBMC.all <- PBMC.all[!grepl('^Rp[sl]', rownames(PBMC.all)), ] # 过滤血红蛋白基因 PBMC.all <- PBMC.all[!grepl("^Hb[^(p)]", rownames(PBMC.all)), ] dim(PBMC.all) #PBMC.all1用于保存原始数据,方便后续调用 saveRDS(PBMC.all, "E:/project/test/PBMC.all1.rds") 2.3 标准化

#标准化 通过总表达值对每个细胞的基因表达值归一化 PBMC.all <- NormalizeData(PBMC.all, normalization.method = "LogNormalize", scale.factor = 1e4) 2.4 寻找高变基因

#寻找差异基因 PBMC.all = FindVariableFeatures(PBMC.all) # 选择前10个,并将前10的高变异基因在图中标注出来 top10 <- head(VariableFeatures(PBMC.all), 10) plot4 = VariableFeaturePlot(object = PBMC.all) NoLegend() plot4 plot5 <- LabelPoints(plot = plot4, points = top10, repel = TRUE) plot5 #合并显示 CombinePlots(plots = list(plot4, plot5)) 2.5 (可选)可通过以下方法修改组织样本信息

timePoints <-ifelse(PBMC.all@meta.data[["orig.ident"]] == 'SRR7722939', 'PBMC_Pre', ifelse(PBMC.all@meta.data[["orig.ident"]] == 'SRR7722940', 'PBMC_EarlyD27', ifelse(PBMC.all@meta.data[["orig.ident"]] == 'SRR7722941'|PBMC.all@meta.data[["orig.ident"]] == 'SRR7722942', 'PBMC_RespD376', 'PBMC_ARD614'))) #PBMC <- AddMetaData(object = PBMC.all, metadata = apply(dataPBMC, 2, sum), col.name = 'nUMI_raw') PBMC.all <- AddMetaData(object = PBMC.all, metadata = timePoints, col.name = 'TimePoints') 2.6 细胞周期分析

#细胞周期分析 mouse_cell_cycle_genes <- readRDS("E:/project/celldex数据库/mouse_cell_cycle_genes.rds") s.genes=mouse_cell_cycle_genes[[1]] g2m.genes=mouse_cell_cycle_genes[[2]] PBMC.all <- CellCycleScoring(PBMC.all, s.features = s.genes, g2m.features = g2m.genes) mouse_cellmarker=read.csv("E:/project/cellmarker数据库/Cell_marker_Mouse.csv",header = TRUE) mouse_cellmarker=unique(mouse_cellmarker$Symbol) 2.7 PCA分析

#使用scaleData改变每个基因的表达,使细胞间的平均表达为0;缩放每个基因的表达, #使细胞间的差异为1,这样高表达基因就不会影响后续分析 #这是pca等降维技术之前的标准预处理步骤,同时去除细胞周期相关基因 PBMC.all = ScaleData(PBMC.all, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(PBMC.all),split.by = "orig.ident") PBMC.all <- RunPCA(PBMC.all, npcs=30,features = mouse_cellmarker,verbose = FALSE) #其中npcs要设置的大一点,在后面主成分分析的时候会受限于这个数值 #使用Harmony进行整合,消除样本间的批次效应 PBMC.all <- RunHarmony(PBMC.all,'orig.ident') #查看去除批次效应前后的结果 DimPlot(PBMC.all,reduction = "pca",group.by = "orig.ident") NoLegend() DimPlot(PBMC.all,reduction = "harmony",group.by = "orig.ident") NoLegend() #用热图查看前500个细胞在前18个PCA中的热图,但分群较多,分三组展示 DimHeatmap(PBMC.all, dims = 1:6, cells = 500, balanced = TRUE) DimHeatmap(PBMC.all, dims = 7:12, cells = 500, balanced = TRUE) DimHeatmap(PBMC.all, dims = 13:18, cells = 500, balanced = TRUE) #PBMC.all3用于亚群数分析,确定resolution saveRDS(PBMC.all, "E:/project/lc_test/PBMC.all3.rds") #PBMC.all=readRDS("E:/project/lc_test/PBMC.all3.rds") 2.8 非线性降维

#使用确定好的resolution进行后续分析 PBMC.all = FindNeighbors(PBMC.all,reduction = "harmony",dims = 1:30) PBMC.all <- FindClusters(object = PBMC.all,method = "igraph",resolution = 0.1) #进行TSNE和UMAP降维 #若前期进行了harmony整合,需要设置reduction = "harmony", #若前期没有进行harmony整合,需要设置reduction = "pca" PBMC.all <- RunTSNE(PBMC.all,dims = 1:30,reduction = "harmony") PBMC.all <- RunUMAP(PBMC.all,dims = 1:30,reduction = "harmony") #TSNE和UMAP降维结果可视化 #可以传入split.by = "orig.ident",按照样品名进行拆分再可视化, #也可以传入group.by = "orig.ident",按照样品名进行整体可视化。以查看 #之前进行的harmony整合是否合理 DimPlot(PBMC.all, reduction = "umap",label = T) DimPlot(PBMC.all, reduction = "umap", split.by = "orig.ident") DimPlot(PBMC.all, reduction = "umap", group.by = "orig.ident") DimPlot(PBMC.all, reduction = "tsne",label = T) DimPlot(PBMC.all, reduction = "tsne", split.by = "orig.ident") DimPlot(PBMC.all, reduction = "tsne", group.by = "orig.ident") 2.9 分析各亚群的marker基因

#确定每个分群相应的marker基因 markers = FindAllMarkers(PBMC.all,only.pos = TRUE,min.pct = 0.1, logfc.threshold = 0.2,return.thresh = 0.01) saveRDS(markers, "E:/project/lc_test/markers.rds") top10 <- markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) DoHeatmap(PBMC.all, features = top10$gene) #可分析特定基因在降维图中的表达情况 FeaturePlot(PBMC.all,features = "GAPDH") 2.10 singleR 预测亚群名称

#为singleR获取可识别的数据集 PBMC.all_for_SingleR <- GetAssayData(PBMC.all, slot="data") clusters=PBMC.all@meta.data$seurat_clusters #singleR预测亚群名称 #获取用来自动预测亚群的数据库,可用的数据库在celldex R包里 #开始预测 load("E:/project/celldex数据库/MouseRNAseqData.Rdata") pred.mouseImmu_M <- SingleR(test = PBMC.all_for_SingleR, ref = MouseRNAseqData, labels = MouseRNAseqData$label.main, method = "cluster", clusters = clusters, assay.type.test = "logcounts", assay.type.ref = "logcounts") print(plotScoreHeatmap(pred.mouseImmu_M,show_colnames =TRUE)) load("E:/project/celldex数据库/ImmGenData.Rdata") pred.mouseImmu_I <- SingleR(test = PBMC.all_for_SingleR, ref = ImmGenData, labels = ImmGenData$label.main, method = "cluster", clusters = clusters, assay.type.test = "logcounts", assay.type.ref = "logcounts") print(plotScoreHeatmap(pred.mouseImmu_I,show_colnames =TRUE)) #获取预测结果 cellType=data.frame(ClusterID=levels(PBMC.all@meta.data$seurat_clusters), ref.se=pred.mouseImmu_M$labels) #可通过 fix(cellType) 对亚群名称进行修改 #将预测到的细胞亚群传给原始数据集 new.cluster.ids <- cellType$ref.se names(new.cluster.ids) <- levels(PBMC.all) PBMC.all <- RenameIdents(PBMC.all, new.cluster.ids) DimPlot(PBMC.all, reduction = 'umap', label = TRUE, pt.size = 0.5) ###若分群不明显,可将所有细胞分为肿瘤细胞和免疫细胞,对免疫细胞重新进行分析,也可通过marker基因的方式,人工确定亚群名称,也可结合数据库预测。 saveRDS(PBMC.all, "E:/project/lc_test/PBMC.all4.rds")

查看全文
大家还看了
也许喜欢
更多游戏

Copyright © 2024 妖气游戏网 www.17u1u.com All Rights Reserved