10X单细胞转录组测序—常规流程（Project RIP）

文章中其中用到的文件获取方式https://pan.baidu.com/s/1HCsHRXNX9Il8u8RIPXSEug?pwd=2626

1.上游分析1.1 安装conda、sratoolkit、cellranger

#安装conda，最好去官网下载最新版 cd ~ wget https://repo.anaconda.com/archive/Anaconda3-2021.11-Linux-x86_64.sh bash Anaconda3-2021.11-Linux-x86_64.sh #安装sratoolkit，用于下载sra数据，同时也会安装fastq-dump #去官网下载最新版本，conda安装的不好用，https://github.com/ncbi/sra-tools/wiki/01.-Downloading-SRA-Toolkit #conda install -c daler sratoolkit #安装cellranger #官网下载最新版cellranger https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest #执行以下命令进行安装 tar -xzvf cellranger-6.1.2.tar #解压完即可使用，但需要添加到环境变量1.2 使用conda进行常用软件安装

conda install -y -c bioconda aspera-cli bwa samtools bedtools bowtie2 fasterq-dump hisat2 cutadapt multiQC1.3 参考基因组下载

##1.2 mm10 wget https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz md5sum refdata-gex-mm10-2020-A.tar.gz #886eeddde8731ffb58552d0bb81f533d refdata-gex-mm10-2020-A.tar.gz tar -xzvf refdata-gex-mm10-2020-A.tar.gz ##1.3 hg38 wget https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2020-A.tar.gz md5sum refdata-gex-GRCh38-2020-A.tar.gz #dfd654de39bff23917471e7fcc7a00cd refdata-gex-GRCh38-2020-A.tar.gz tar -xzvf refdata-gex-GRCh38-2020-A.tar.gz1.4 文件夹创建

mkdir 1.sra 2.raw_fastq 3.cellranger1.5 数据下载

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112302 #SRA Run Selector里面选择需要的数据下载，在该篇文章中是下载6个tumor数据。选择要下载的数据样本后点击Accession List获取相应的id号。并将其命名为id，保存在1.sra文件夹中。 cd 1.sra cat id | while read id;do (prefetch $id &);done1.6 SRA转fastq

cd 2.fastq ln -s ../1.sra/SRR* ./ ls SRR* |while read id;do (nohup fasterq-dump -O ./ --split-files -e 6 ./$id --include-technical & );done #上一步运行完会非常占用空间，可压缩节省空间。 ls SRR*fastq | while read id;do gzip $id;done1.7 cellranger count流程

##修改文件名，改成cellranger可识别的文件名。 cat ../1.sra/id |while read id ;do (mv ${id}_1*.gz ${id}_S1_L001_I1_001.fastq.gz;mv ${id}_2*.gz ${id}_S1_L001_R1_001.fastq.gz;mv ${id}_3*.gz ${id}_S1_L001_R2_001.fastq.gz);done ##运行cellranger #指定参考基因组 ref=/home/data/vip10t17/software_install/10x_refernce/refdata-gex-GRCh38-2020-A ls *.fastq.gz | cut -d "_" -f 1 |uniq |while read id;do cellranger count --id $id --transcriptome $ref --fastqs 2.raw.fastq/ --sample $id --nosecondary --localcores 10 --localmem 30;done ##参数解读 --id 指定输出文件夹的名字 --transcriptome 指定参考基因组的路径 --sample 指定需要处理的fastq文件的前缀 --expect-cell 指定预期的细胞数目，默认参数是3000个 --localcores 指定计算的核心数 --mempercore 指定内存大小 GB --nosecondary 不需要进行降维聚类（因为后期会用R可视化） --chemistry，默认是自动识别chemistry，但是有些时候识别不出chemistry的时候，需要加入参数特别标明1.8 结果解读

web_summary.HTML：必看，官方说明 summary HTML file ，包括许多QC指标，预估细胞数，比对率等；
metrics_summary.CSV：CSV格式数据摘要，可以不看；
possorted_genome_bam.bam：比对文件，用于可视化比对的reads和重新创建FASTQ文件，可以不看；
possorted_genome_bam.bam.bai：索引文件；
filtered_gene_bc_matrices：是重要的一个目录，下面又包含了 barcodes.tsv.gz、features.tsv.gz、matrix.mtx.gz，是下游Seurat、Scater、Monocle等分析的输入文件，是经过Cell Ranger过滤后构建矩阵所需要的所有文件；
filtered_feature_bc_matrix.h5：过滤掉的barcode信息HDF5 format，可以不看；
raw_feature_bc_matrix：原始barcode信息，未过滤的可以用于构建矩阵的文件，可以不看；
raw_feature_bc_matrix.h5：原始barcode信息HDF5 format，可以不看；
analysis：数据分析目录，下面又包含聚类clustering（有graph-based & k-means）、差异分析diffexp、主成分线性降维分析pca、非线性降维tsne，因为我们自己会走Seurat流程，所以不用看；
molecule_info.h5：可用于整合多样本，使用cellranger aggr函数；
cloupe.cloupe：官方可视化工具Loupe Cell Browser 输入文件，无代码分析的情况下使用，会代码的同学通常用不到。

将所有结果中的filtered_gene_bc_matrices文件夹保存在自己电脑output文件夹下，进行下游分析

2. 下游分析2.1 R包安装、加载

library(Seurat) library(ggplot2) library(clustree) library(cowplot) library(dplyr) library(harmony) library(SingleR) library(celldex) library(EnhancedVolcano) library(tidyverse) library(ggrepel) library(rgl) library(fgsea) library(clustree) library(patchwork)2.2 数据读取过滤

#设置工作路径 setwd('E:/project/test/') dir='E:/project/test/cellranger' #设置样本名 samples=c("100","400") samples #获取单细胞数据 sceList = lapply(samples,function(pro){ print(pro) sce=CreateSeuratObject( Read10X(file.path(dir,pro)), project = pro, min.cells = 5, min.features = 200) return(sce) }) #会自动读取output文件夹下所有的文件，同时进行数据过滤， #一个基因最少细胞表达数为5，一个细胞最少基因表达量为300。 #合并数据 PBMC.all=merge(x=sceList[[1]], y=sceList[[2]], add.cell.ids = c("100","400")) #计算^MT-线粒体基因所占的比例,人：全大写，小鼠：全小写 PBMC.all[["percent.mt"]] = PercentageFeatureSet(PBMC.all,pattern = "^mt-") #计算血红细胞相关基因比例，人：全大写，小鼠：首字母大写其余小写 PBMC.all[["percent.hb"]] = PercentageFeatureSet(PBMC.all,pattern = "^Hb[^(p)]") #分析核糖体基因。人：全大写，小鼠：首字母大写其余小写 PBMC.all[["percent.rp"]] = PercentageFeatureSet(PBMC.all,pattern = "^Rp[sl]") Vlnplot(PBMC.all, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.hb", "percent.rp"), ncol = 3) #其中nFeature_RNA：一个细胞中检测到的基因总数，nCount_RNA：一个细胞中检测到的分子数 #根据数据的分布进行过滤，主要过滤极值，其中mt部分一般是小于10，但也不绝对 #肌肉细胞的线粒体基因占比最高可达50%，肿瘤区域的正常细胞线粒体含量有时也在30%以上 #但为了尽可能保留更多的数据可以提高这个数值，HB部分一般是小于3。 PBMC.all = subset(PBMC.all,subset = nFeature_RNA<7500&percent.mt<15&percent.hb<3&percent.rp<50) #通过质控检测数据过滤的好坏 plot1 = FeatureScatter(PBMC.all,feature1 = "nCount_RNA", feature2 = "percent.mt") NoLegend() plot1#会呈现出三角的形成 plot2 = FeatureScatter(PBMC.all,feature1 = "nCount_RNA", feature2 = "percent.hb") NoLegend() plot2#会呈现成横线状 plot3 = FeatureScatter(PBMC.all,feature1 = "nCount_RNA", feature2 = "nFeature_RNA") NoLegend() plot3#会呈现出圆弧状 #将三张图合并显示 CombinePlots(plots = list(plot1, plot2,plot3)) #计算基因reads数贡献值 C <- PBMC.all@assays$RNA@counts C <- Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100 most_expressed <- order(apply(C, 1, median), decreasing = T)[20:1] boxplot(t(as.matrix(C[most_expressed, ])), cex = 0.1, las = 1, xlab = "% total count per cell", col = (scales::hue_pal())(20)[20:1], horizontal = TRUE) #结果中根据占比，删除占比较大的基因，其中MALAT1的高检测可能是一个技术问题 dim(PBMC.all) # 过滤MALAT1基因 PBMC.all <- PBMC.all[!grepl("Malat1", rownames(PBMC.all)), ] # 过滤线粒体基因 PBMC.all <- PBMC.all[!grepl("^mt-", rownames(PBMC.all)), ] # 过滤核糖体基因 PBMC.all <- PBMC.all[!grepl('^Rp[sl]', rownames(PBMC.all)), ] # 过滤血红蛋白基因 PBMC.all <- PBMC.all[!grepl("^Hb[^(p)]", rownames(PBMC.all)), ] dim(PBMC.all) #PBMC.all1用于保存原始数据，方便后续调用 saveRDS(PBMC.all, "E:/project/test/PBMC.all1.rds")2.3 标准化

#标准化通过总表达值对每个细胞的基因表达值归一化 PBMC.all <- NormalizeData(PBMC.all, normalization.method = "LogNormalize", scale.factor = 1e4)2.4 寻找高变基因

#寻找差异基因 PBMC.all = FindVariableFeatures(PBMC.all) # 选择前10个，并将前10的高变异基因在图中标注出来 top10 <- head(VariableFeatures(PBMC.all), 10) plot4 = VariableFeaturePlot(object = PBMC.all) NoLegend() plot4 plot5 <- LabelPoints(plot = plot4, points = top10, repel = TRUE) plot5 #合并显示 CombinePlots(plots = list(plot4, plot5))2.5 （可选）可通过以下方法修改组织样本信息

timePoints <-ifelse(PBMC.all@meta.data[["orig.ident"]] == 'SRR7722939', 'PBMC_Pre', ifelse(PBMC.all@meta.data[["orig.ident"]] == 'SRR7722940', 'PBMC_EarlyD27', ifelse(PBMC.all@meta.data[["orig.ident"]] == 'SRR7722941'|PBMC.all@meta.data[["orig.ident"]] == 'SRR7722942', 'PBMC_RespD376', 'PBMC_ARD614'))) #PBMC <- AddMetaData(object = PBMC.all, metadata = apply(dataPBMC, 2, sum), col.name = 'nUMI_raw') PBMC.all <- AddMetaData(object = PBMC.all, metadata = timePoints, col.name = 'TimePoints')2.6 细胞周期分析

#细胞周期分析 mouse_cell_cycle_genes <- readRDS("E:/project/celldex数据库/mouse_cell_cycle_genes.rds") s.genes=mouse_cell_cycle_genes[[1]] g2m.genes=mouse_cell_cycle_genes[[2]] PBMC.all <- CellCycleScoring(PBMC.all, s.features = s.genes, g2m.features = g2m.genes) mouse_cellmarker=read.csv("E:/project/cellmarker数据库/Cell_marker_Mouse.csv",header = TRUE) mouse_cellmarker=unique(mouse_cellmarker$Symbol)2.7 PCA分析

#使用scaleData改变每个基因的表达，使细胞间的平均表达为0；缩放每个基因的表达， #使细胞间的差异为1，这样高表达基因就不会影响后续分析 #这是pca等降维技术之前的标准预处理步骤，同时去除细胞周期相关基因 PBMC.all = ScaleData(PBMC.all, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(PBMC.all),split.by = "orig.ident") PBMC.all <- RunPCA(PBMC.all, npcs=30,features = mouse_cellmarker,verbose = FALSE) #其中npcs要设置的大一点，在后面主成分分析的时候会受限于这个数值 #使用Harmony进行整合，消除样本间的批次效应 PBMC.all <- RunHarmony(PBMC.all,'orig.ident') #查看去除批次效应前后的结果 DimPlot(PBMC.all,reduction = "pca",group.by = "orig.ident") NoLegend() DimPlot(PBMC.all,reduction = "harmony",group.by = "orig.ident") NoLegend() #用热图查看前500个细胞在前18个PCA中的热图,但分群较多，分三组展示 DimHeatmap(PBMC.all, dims = 1:6, cells = 500, balanced = TRUE) DimHeatmap(PBMC.all, dims = 7:12, cells = 500, balanced = TRUE) DimHeatmap(PBMC.all, dims = 13:18, cells = 500, balanced = TRUE) #PBMC.all3用于亚群数分析，确定resolution saveRDS(PBMC.all, "E:/project/lc_test/PBMC.all3.rds") #PBMC.all=readRDS("E:/project/lc_test/PBMC.all3.rds")2.8 非线性降维

#使用确定好的resolution进行后续分析 PBMC.all = FindNeighbors(PBMC.all,reduction = "harmony",dims = 1:30) PBMC.all <- FindClusters(object = PBMC.all,method = "igraph",resolution = 0.1) #进行TSNE和UMAP降维 #若前期进行了harmony整合，需要设置reduction = "harmony"， #若前期没有进行harmony整合，需要设置reduction = "pca" PBMC.all <- RunTSNE(PBMC.all,dims = 1:30,reduction = "harmony") PBMC.all <- RunUMAP(PBMC.all,dims = 1:30,reduction = "harmony") #TSNE和UMAP降维结果可视化 #可以传入split.by = "orig.ident"，按照样品名进行拆分再可视化， #也可以传入group.by = "orig.ident"，按照样品名进行整体可视化。以查看 #之前进行的harmony整合是否合理 DimPlot(PBMC.all, reduction = "umap",label = T) DimPlot(PBMC.all, reduction = "umap", split.by = "orig.ident") DimPlot(PBMC.all, reduction = "umap", group.by = "orig.ident") DimPlot(PBMC.all, reduction = "tsne",label = T) DimPlot(PBMC.all, reduction = "tsne", split.by = "orig.ident") DimPlot(PBMC.all, reduction = "tsne", group.by = "orig.ident")2.9 分析各亚群的marker基因

#确定每个分群相应的marker基因 markers = FindAllMarkers(PBMC.all,only.pos = TRUE,min.pct = 0.1, logfc.threshold = 0.2,return.thresh = 0.01) saveRDS(markers, "E:/project/lc_test/markers.rds") top10 <- markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) DoHeatmap(PBMC.all, features = top10$gene) #可分析特定基因在降维图中的表达情况 FeaturePlot(PBMC.all,features = "GAPDH")2.10 singleR 预测亚群名称

#为singleR获取可识别的数据集 PBMC.all_for_SingleR <- GetAssayData(PBMC.all, slot="data") clusters=PBMC.all@meta.data$seurat_clusters #singleR预测亚群名称 #获取用来自动预测亚群的数据库，可用的数据库在celldex R包里 #开始预测 load("E:/project/celldex数据库/MouseRNAseqData.Rdata") pred.mouseImmu_M <- SingleR(test = PBMC.all_for_SingleR, ref = MouseRNAseqData, labels = MouseRNAseqData$label.main, method = "cluster", clusters = clusters, assay.type.test = "logcounts", assay.type.ref = "logcounts") print(plotScoreHeatmap(pred.mouseImmu_M,show_colnames =TRUE)) load("E:/project/celldex数据库/ImmGenData.Rdata") pred.mouseImmu_I <- SingleR(test = PBMC.all_for_SingleR, ref = ImmGenData, labels = ImmGenData$label.main, method = "cluster", clusters = clusters, assay.type.test = "logcounts", assay.type.ref = "logcounts") print(plotScoreHeatmap(pred.mouseImmu_I,show_colnames =TRUE)) #获取预测结果 cellType=data.frame(ClusterID=levels(PBMC.all@meta.data$seurat_clusters), ref.se=pred.mouseImmu_M$labels) #可通过 fix(cellType) 对亚群名称进行修改 #将预测到的细胞亚群传给原始数据集 new.cluster.ids <- cellType$ref.se names(new.cluster.ids) <- levels(PBMC.all) PBMC.all <- RenameIdents(PBMC.all, new.cluster.ids) DimPlot(PBMC.all, reduction = 'umap', label = TRUE, pt.size = 0.5) ###若分群不明显，可将所有细胞分为肿瘤细胞和免疫细胞，对免疫细胞重新进行分析，也可通过marker基因的方式，人工确定亚群名称，也可结合数据库预测。 saveRDS(PBMC.all, "E:/project/lc_test/PBMC.all4.rds")

查看全文