今天,我们关注的数据集是标题为Impact of ALK inhibition or phospho-deficient mutation of CDK9 on gene expression in ovarian cancer cells的数据集,目前该文章尚处于未发表状态。小伙伴们,我们可以一起在学习转录组数据分析的同时,猜猜作者的整体研究思路,顺便补充一下背景知识,增强自己对于课题设计的能力,我就抛砖引玉,在此描述自身的一些看法。
首先咱们先补充下文章所涉及的三个关键对象(CDK9、ALK与卵巢癌细胞系)的背景知识
CDK9是细胞周期蛋白依赖性激酶家族的一员,与细胞增殖、分化与凋亡有关。目前,CDK9抑制剂在肿瘤药物开发市场上大行其道,大量的肿瘤药物如AZD-4573 、VIP-152 、 GFH-009等都是基于CDK9抑制剂开发的。有研究表明CDK9是正性转录因子P-TEFb的催化亚基,催化P-TEFb后,能促进基因转录。然而,抑制CDK19,会抑制P-TEFb的激活,P-TEFb失活后会导致RNAPoly-IIC末端区域的磷酸化被阻断,进而抑制转录,促进肿瘤细胞凋亡。ALK是酪氨酸激酶的一种,主要在癌症组织中表达,在正常人中不表达,ALK靶点可以说是目前针对癌症的最热门靶点之一,临床也存在大量的ALK抑制剂。作者所用的卵巢癌细胞系是SKOV3细胞系,它是人卵巢癌在细胞水平研究的经典细胞系。
介绍完了背景,那就让我们猜测一下作者的课题设计思路吧
基于上述研究背景介绍以及结合作者的标题,作者应该是想看CDK9具体的磷酸化位点Y19F突变能否抑制CDK19发挥抑制转录,促进肿瘤细胞凋亡的作用。此外,作者应该还想探究经典抑制剂ALK能否促进卵巢癌细胞细胞凋亡,以及ALK与CDK9是否存在通路上的关联。
介绍了背景,描述了课题思路猜测,接下来咱们言归正传,开始转录组分析吧。分析之前,我们先看一下该转录组数据集的组成,可以发现其为3分组,每个分组具有3个样本,如下所示:
GSM5706092 WT_01
GSM5706093 WT_02
GSM5706094 WT_03
GSM5706095 Ai_01
GSM5706096 Ai_02
GSM5706097 Ai_03
GSM5706098 Y19F_01
GSM5706099 Y19F_02
GSM5706100 Y19F_03
数据集地址如下:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE189695
处理数据的话,作者上传了可以做差异分析的count矩阵。
数据集下载链接为:https://ftp.ncbi.nlm.nih.gov/geo/series/GSE189nnn/GSE189695/suppl/GSE189695_Gene_expression_count.xls.gz (请自行下载)
此次转录组分析的目的是探究AKT抑制剂处理组(Ai)以及CDK9的Y19F突变处理组相较于正常组WT的基因表达差异情况,以及两种处理产生的差异基因的交集情况,探究两种处理方式对于卵巢癌细胞的影响。
rm(list = ls())
library(edgeR)
library(clusterProfiler)
library(DESeq2)
library(FactoMineR)
library(factoextra)
library(org.Hs.eg.db)
library(stringr)
library(stringi)
library(tidyverse)
library(ggplot2)
library(patchwork)
library(pheatmap)
library(VennDiagram)
library(RColorBrewer)
rawcount <- read.delim("./GSE189695_Gene_expression_count.xls",header = T,sep="\t")
rawcount <- rawcount[,1:10]
table(!duplicated(rawcount[,1]))
##
## TRUE
## 58735
rownames(rawcount) <- rawcount[,1]
rawcount <- rawcount[,-1]
##转换ID
id2symbol <- bitr(rownames(rawcount),
fromType = "ENSEMBL",
toType = "SYMBOL",
OrgDb = org.Hs.eg.db)
rawcount <- cbind(GeneID=rownames(rawcount),rawcount) #多一列换成基因ID
rawcount <- merge(id2symbol,rawcount,
by.x="ENSEMBL",by.y="GeneID",all.y=T)
rawcount=rawcount[!duplicated(rawcount$SYMBOL),]
rawcount <- drop_na(rawcount) #去除含有NA值的行
rawcount <- rawcount[,!colnames(rawcount)%in%c("ENSEMBL","Ensmble","median")]#正式获得基因rawcount矩阵
keep <- rowSums(rawcount>0) >= floor(0.5*ncol(rawcount))
filter_count <- rawcount[keep,] #获得filter_count矩阵
rownames(filter_count) <- filter_count$SYMBOL
filter_count <- filter_count[,!colnames(filter_count)%in%"SYMBOL"]
express_cpm <- log2(cpm(filter_count)+ 1)
express_cpm[1:4,1:4] #获得cpm矩阵
## WT_01 WT_02 WT_03 Ai_01
## TSPAN6 5.442611 5.314416 5.535526 5.570077
## DPM1 5.852307 5.950811 5.955589 5.789793
## SCYL3 3.587102 3.290299 2.872606 3.344491
## C1orf112 3.844405 3.809197 3.695648 3.781732
save(express_cpm,filter_count,file="Step03filterGeneCountCpm.Rdata")
#比较一:WT vs AI
filter_count1 <- filter_count[,c(1,2,3,4,5,6)]
express_cpm1 <- express_cpm[,c(1,2,3,4,5,6)]
colnames(filter_count1) <- c("WT1","WT2",'WT3',"Ai1","Ai2",'Ai3')#给样本命名
colnames(express_cpm1) <- colnames(filter_count1)
group1=rep(c("WT","Ai"),each=3)
group_list1=factor(group1,levels = c("WT","Ai"))
table(group_list1)#检查一下组别数量
## group_list1
## WT Ai
## 3 3
#比较二:WT vs Y19F
load("Step03filterGeneCountCpm.Rdata")
filter_count2 <- filter_count[,c(1,2,3,7,8,9)]
express_cpm2 <- express_cpm[,c(1,2,3,7,8,9)]
colnames(filter_count2) <- c("WT1","WT2",'WT3',"Y19F1","Y19F2",'Y19F3')#给样本命名
colnames(express_cpm2) <- colnames(filter_count2)
group2=rep(c("WT","Y19F"),each=3)
group_list2=factor(group2,levels = c("WT","Y19F"))
table(group_list2)#检查一下组别数量
## group_list2
## WT Y19F
## 3 3
## 01分别绘制两组的箱线图并且拼接起来
exprSet1 <- express_cpm1
data1 <- data.frame(expression=c(exprSet1),
sample=rep(colnames(exprSet1),each=nrow(exprSet1)))
p1.1 <- ggplot(data = data1,aes(x=sample,y=expression,fill=sample))
p1.1 <- p1.1 + geom_boxplot() + theme(axis.text.x = element_text(angle = 90)) +
xlab(NULL) + ylab("log2(CPM+1)")+theme_bw()
exprSet2 <- express_cpm2
data2 <- data.frame(expression=c(exprSet2),
sample=rep(colnames(exprSet2),each=nrow(exprSet2)))
p1.2 <- ggplot(data = data2,aes(x=sample,y=expression,fill=sample))
p1.2 <- p1.2 + geom_boxplot() + theme(axis.text.x = element_text(angle = 90)) +
xlab(NULL) + ylab("log2(CPM+1)")+theme_bw()
p1.1+p1.2
## 02分别绘制两组比较的PCA图并且拼接起来
## 绘制第一组比较的PCA图
dat1 <- express_cpm1
dat1 <- as.data.frame(t(dat1))
dat1 <- na.omit(dat1)
dat1$group_list1 <- group_list1
dat_pca1 <- PCA(dat1[,-ncol(dat1)], graph = FALSE)#画图仅需数值型数据,去掉最后一列的分组信息
p2.1 <- fviz_pca_ind(dat_pca1,
geom.ind = "point", # 只显示点,不显示文字
col.ind = dat1$group_list1, # 用不同颜色表示分组
palette = c("#00AFBB", "#E7B800"),
addEllipses = T, # 是否圈起来,少于4个样圈不起来
legend.title = "Groups") + theme_bw()
# 绘制第二组比较的PCA图
dat2 <- express_cpm2
dat2 <- as.data.frame(t(dat2))
dat2 <- na.omit(dat2)
dat2$group_list2 <- group_list2
dat_pca2 <- PCA(dat2[,-ncol(dat2)], graph = FALSE)
p2.2 <- fviz_pca_ind(dat_pca2,
geom.ind = "point", # 只显示点,不显示文字
col.ind = dat2$group_list2, # 用不同颜色表示分组
palette = c("#00AFBB", "#E7B800"),
addEllipses = T, # 是否圈起来,少于4个样圈不起来
legend.title = "Groups") + theme_bw()
p2.1+p2.2
# 01对第一组比较进行差异分析
exprSet1 <- filter_count1
design1 <- model.matrix(~0+rev(factor(group_list1)))
rownames(design1) <- colnames(exprSet1)
colnames(design1) <- levels(factor(group_list1))
DEG1 <- DGEList(counts=exprSet1, #构建edgeR的DGEList对象
group=factor(group_list1))
DEG1$samples$lib.size
## [1] 19604619 19127530 18817593 20856939 18062388 19088691
DEG1 <- calcNormFactors(DEG1)
DEG1$samples$norm.factors
## [1] 0.9929908 0.9894818 0.9929961 1.0075946 1.0071244 1.0100211
# 计算线性模型的参数
DEG1 <- estimateGLMCommonDisp(DEG1,design1)
DEG1 <- estimateGLMTrendedDisp(DEG1, design1)
DEG1 <- estimateGLMTagwiseDisp(DEG1, design1)
# 拟合线性模型
fit1 <- glmFit(DEG1, design1)
lrt1 <- glmLRT(fit1, contrast=c(1,-1))
# 提取过滤差异分析结果
DEG_edgeR1 <- as.data.frame(topTags(lrt1, n=nrow(DEG1)))
head(DEG_edgeR1)
## logFC logCPM LR PValue FDR
## ERBB2 0.5464030 11.114264 418.0616 6.447066e-93 1.110443e-88
## THBS1 -0.5977559 12.334458 377.1758 5.126754e-84 4.415161e-80
## ANPEP 0.9537092 7.380956 301.6121 1.467445e-67 8.425093e-64
## IGFBP3 -0.4921004 10.774577 255.3659 1.756578e-57 7.563823e-54
## SLC7A5 -0.4675588 11.301005 227.4981 2.093961e-51 7.213278e-48
## FSTL1 -0.4178526 10.815360 222.3786 2.738836e-50 7.862287e-47
# 设定阈值,筛选显著上下调差异基因
fc <- 2.0
p <- 0.05
DEG_edgeR1$regulated <- ifelse(DEG_edgeR1$logFC>log2(fc)&DEG_edgeR1$PValue<p,
"up",ifelse(DEG_edgeR1$logFC<(-log2(fc))&DEG_edgeR1$PValue<p,"down","normal"))
table(DEG_edgeR1$regulated)
##
## down normal up
## 189 16855 180
# 02对第二组比较进行差异分析
exprSet2 <- filter_count2
design2 <- model.matrix(~0+rev(factor(group_list2)))
rownames(design2) <- colnames(exprSet2)
colnames(design2) <- levels(factor(group_list2))
DEG2 <- DGEList(counts=exprSet2, #构建edgeR的DGEList对象
group=factor(group_list2))
DEG2$samples$lib.size
## [1] 19604619 19127530 18817593 29120598 27715691 26280833
DEG2 <- calcNormFactors(DEG2)
DEG2$samples$norm.factors
## [1] 0.9923097 0.9911396 0.9924724 1.0074705 1.0103825 1.0064247
# 计算线性模型的参数
DEG2 <- estimateGLMCommonDisp(DEG2,design2)
DEG2 <- estimateGLMTrendedDisp(DEG2, design2)
DEG2 <- estimateGLMTagwiseDisp(DEG2, design2)
# 拟合线性模型
fit2 <- glmFit(DEG2, design2)
lrt2 <- glmLRT(fit2, contrast=c(2,-2))
# 提取过滤差异分析结果
DEG_edgeR2 <- as.data.frame(topTags(lrt2, n=nrow(DEG2)))
head(DEG_edgeR2)
## logFC logCPM LR PValue FDR
## NUP210 -3.443919 6.825185 937.0344 8.732958e-206 1.504165e-201
## CDH13 -6.936306 4.477537 780.0703 1.161999e-171 1.000714e-167
## ZNF542P -2.188561 7.452704 406.4660 2.154756e-90 1.237117e-86
## THBS1 -1.396453 12.295024 401.3698 2.771706e-89 1.193497e-85
## INHBA 2.173195 6.918936 399.6435 6.584726e-89 2.268306e-85
## EFEMP1 2.730098 6.092518 392.1923 2.758172e-87 7.917793e-84
# 设定阈值,筛选显著上下调差异基因
fc <- 2.0
p <- 0.05
DEG_edgeR2$regulated <- ifelse(DEG_edgeR2$logFC>log2(fc)&DEG_edgeR2$PValue<p,
"up",ifelse(DEG_edgeR2$logFC<(-log2(fc))&DEG_edgeR2$PValue<p,"down","normal"))
table(DEG_edgeR2$regulated)
##
## down normal up
## 705 15855 664
## 01火山图的绘制及拼图
## 绘制第一组比较的火山图
data1 <- DEG_edgeR1
p3.1 <- ggplot(data=data1, aes(x=logFC, y=-log10(PValue),color=regulated)) +
geom_point(alpha=0.5, size=1.8) +
theme_set(theme_set(theme_bw(base_size=20))) +
xlab("log2FC") + ylab("-log10(Pvalue)") +
scale_colour_manual(values = c('blue','black','red'))+theme_bw()
## 绘制第二组比较的火山图
data2 <- DEG_edgeR2
p3.2 <- ggplot(data=data2, aes(x=logFC, y=-log10(PValue),color=regulated)) +
geom_point(alpha=0.5, size=1.8) +
theme_set(theme_set(theme_bw(base_size=20))) +
xlab("log2FC") + ylab("-log10(Pvalue)") +
scale_colour_manual(values = c('blue','black','red'))+theme_bw()
## 拼图
p3.1+p3.2
## 02差异基因热图的绘制及拼图
## 绘制第一组比较的差异基因热图
edgeR1_sigGene1 <- DEG_edgeR1[DEG_edgeR1$regulated!="normal",]
edgeR1_sigGene1 <-rownames(edgeR1_sigGene1)
dat1 <- express_cpm1[match(edgeR1_sigGene1,rownames(express_cpm1)),]
dat1[1:4,1:4]
## WT1 WT2 WT3 Ai1
## IGFN1 7.054281 6.784852 7.007268 5.581679
## PPP1R1B 5.645877 5.502522 5.768548 6.694265
## TNC 5.111178 5.087000 5.053281 6.251318
## EPAS1 6.496905 6.592708 6.482653 5.724574
group1 <- data.frame(group=group_list1)
rownames(group1)=colnames(dat1)
library(pheatmap)
p4.1 <- pheatmap(dat1,scale = "row",show_colnames =T,show_rownames = F,
cluster_cols = T,
annotation_col=group1,
main = "edgeR's DEG")
## 绘制第二组比较的差异基因热图
edgeR1_sigGene2 <- DEG_edgeR2[DEG_edgeR2$regulated!="normal",]
edgeR1_sigGene2 <-rownames(edgeR1_sigGene2)
dat2 <- express_cpm2[match(edgeR1_sigGene2,rownames(express_cpm2)),]
dat2[1:4,1:4]
## WT1 WT2 WT3 Y19F1
## NUP210 7.412340 7.484784 7.440102 5.811213
## CDH13 5.467772 5.357366 5.425660 2.266557
## ZNF542P 7.822119 7.943053 7.918534 6.852903
## THBS1 12.611673 12.594574 12.566268 11.950566
group2 <- data.frame(group=group_list2)
rownames(group2)=colnames(dat2)
p4.2 <- pheatmap(dat2,scale = "row",show_colnames =T,show_rownames = F,
cluster_cols = T,
annotation_col=group2,
main = "edgeR's DEG")
## 热图拼图
library(ggplotify)
p4.1 <- as.ggplot(p4.1)
p4.2 <- as.ggplot(p4.2)
p4.1+p4.2
# 01先获取第一组比较的上调基因,下调基因与差异基因
deg1 <- rownames(DEG_edgeR1[DEG_edgeR1$regulated!="normal",])
up1 <- rownames(DEG_edgeR1[DEG_edgeR1$regulated=="up",])
down1 <- rownames(DEG_edgeR1[DEG_edgeR1$regulated=="down",])
# 02再获取第二组比较的上调基因,下调基因与差异基因
deg2 <- rownames(DEG_edgeR2[DEG_edgeR2$regulated!="normal",])
up2 <- rownames(DEG_edgeR2[DEG_edgeR2$regulated=="up",])
down2 <- rownames(DEG_edgeR2[DEG_edgeR2$regulated=="down",])
# 03绘制两组差异比较的韦恩图交集情况
library(ggvenn)
# 绘制两组比较整体差异基因的韦恩图
deg<-list(`deg1`=deg1,
`deg2`=deg2)
p5 <- ggvenn(deg,
show_percentage = F,
stroke_color = "white",
fill_color = c("#b2d4ec","#d3c0e2"),
set_name_color = c("#ff0000","#4a9b83"))
p5
# 绘制两组比较上调差异基因韦恩图
up<-list(`up1`=up1,
`up2`=up2)
p6 <- ggvenn(up,
show_percentage = F,
stroke_color = "white",
fill_color = c("#ffb2b2","#b2e7cb"),
set_name_color = c("#ff0000","#4a9b83"))
p6
# 绘制两组下调差异基因韦恩图
down<-list(`down1`=down1,
`down2`=down2)
p7 <- ggvenn(down,
show_percentage = F,
stroke_color = "white",
fill_color = c("#b2d4ec","08a4ad"),
set_name_color = c("#ff0000","#4a9b83"))
p7
# 三幅韦恩图拼图
p5 +p6 + p7
以上就是转录组三分组常见的差异分析方式,大家可以来试试额。上述代码复制即可实现,若自身想进行数据挖掘分析以及自身转录组下游数据处理,可以尝试尝试这种方式额。
❝背景补充部分重点参考文献
Cassandri M, Fioravanti R, Pomella S, Valente S, Rotili D, Del Baldo G, De Angelis B, Rota R, Mai A. CDK9 as a Valuable Target in Cancer: From Natural Compounds Inhibitors to Current Treatment in Pediatric Soft Tissue Sarcomas. Front Pharmacol. 2020 Aug 13;11:1230. doi: 10.3389/fphar.2020.01230. PMID: 32903585; PMCID: PMC7438590.
❞