好用的R小技巧

我觉得 R 是一种很神奇的语言,用好了的话事半功倍,但其间要会有很多很多坑,希望大家多多注意不要泥足深陷。

  1. 时刻注意数据类型,很多报错是因为类型不符而产生的 chr or int ?factor 也是一个很难缠的数据类型(但省位子)
  2. 经常plot一下你的处理后数据,查看是否有异常。 很简单的一步,chr 变 numeric,需要as.numeric(as.character(x)) ,这样数据才不会出错。

==================读入文件================

dat <- as.data.frame(read.table("input.csv",header = TRUE,sep = ",", dec = ".",na.strings = "NA",stringsAsFactors=FALSE,check.names = FALSE))
row.names(dat) <- dat[,1]
dat<-dat[,-1]
dat[1:5,]

na.strings = "NA"控制缺失值
stringsAsFactors=FALSE,让每列的数据该是什么类型就是什么类型,全列数字的才自动成intdbl ,
check.names = FALSE,有行名如果是纯数字开头,会自动变成X开头,这个对后面的匹配很不友好,所以可以加这一句,避免变X。

==================写出文件================

#csv & txt
y_name=gsub("/", "&", y_name)
write.csv(d,file = paste('dat/',y_name,'.csv',sep=""),quote=F,row.names=F)

#png
png(file="metabolites_distribution_among_cancertype.png",width=2000,height=1000)
#content...
dev.off()

#pdf
pdf(file="metabolites_distribution_among_cancertype.pdf")
#content...
dev.off()

============ 删除带有的NA行并匹配两表格 ============

#以下两种方法需要table1 和 table2 的行名是按顺序一一对应的,
#先获取table1中NA的行号
dim(table1)
non_NA <- !is.na(table1[,1])
table1<-table1[non_NA,]     #(去NA行方法1)
summary(non_NA)

#如果有另一个table2 与 table1 对应,需要同样去除NA的行
table2<-as.data.frame(table2[non_NA,])
dim(table2)

#同理,也可以先获取NA的行号,使用时直接调用
non_NA <- !is.na(table2[,1])
function(data.x=table1[non_NA,], data.y=table2[non_NA,]...)
#如果table1 和table2 的行名顺序是不一样的,需要用merge
#先获取去除table1d NA行(更新table1)
table1<-table1[complete.cases(table1),]  #(去NA行方法2)
#table1<-na.omit(table1) #(去NA行方法3)
dat_rmNA<-merge(table1,table2,by="row.names",all.x=TRUE)

================删除全部为NA的行/列===============

# remove all NA row/column
drop_Nas <- function (data,dim){
     if (dim == 2){
         na_flag <- apply(!is.na(data),2,sum) # use ! to inverse 1 (to 0 0) and 0 (to 1).
         data <- data[,-which(na_flag == 0)]
     }
     else if (dim == 1){
         na_flag <- apply(!is.na(data),1,sum)
         data <- data[-which(na_flag == 0),]
     }
     else{
         warning("dim can only equal to 1 and 2, 1 means row, 2 means column ")
     }
     return(data)
 }

dim(tumor)
drop_Nas(dat,1)  #1 means row, 2 means column
删除前
删除后

================impute with 最小值================

#check missing value and impute with min value
sum(is.na(data))

library(Hmisc)
impute_min=function(x) impute(x,min)
new=as.data.frame(apply(data,1,impute_min))   #1是行最小值,2是列最小值,记得要是numeric
head(data) #before impute
head(new)  #after impute
data=new
sum(is.na(data))
data[1:3,]
head(data)
head(new)

==================data.frame 格式改变 ===============

#一整个data.frame(需确认全是数字)的转换
dat_n=as.data.frame(lapply(dat[,-1],as.numeric))

#字符串型label转变为numeric类型
table(dat[,1])
a <- sub("F",2,data_pca[,1])
b <- sub("M",1,a)
dat[,1] <- a                         
dat[,1] <- b
dat<-as.data.frame(data_pca)

#一整个data.frame的factor变为numeric  #is.factor也可以换成 is.character
dat_n[] <- lapply(dat, function(x) {
    if(is.factor(x)) as.numeric(as.character(x)) else x
}) 

==================统计每行/列有多少个0================

dat=as.data.frame(dat)
#dat[1:5,]
f<-function(x) sum(x==0)
#按列统计是2,按行统计是1
d<-as.data.frame(apply(dat,2,f))
d

#如果数据中心还有NA,则整一个会显示为NA

#画图直接显示多少个NA
barplot(colSums(is.na(data)),
        ylab = "# missing values",
        xlab = "Sample")

==================统计缺失值================

#统计总缺失值个数
sum(is.na(data))

#按列统计缺失值
sapply(data, function(x) sum(is.na(x)))

#统计缺失率,按列为2,按行为1
miss <- function(x){sum(is.na(x))/length(x)}
apply(data,2,miss)

============ jupyter notebook 中 python 结果显示多行==========

from IPython.core.interactiveshell import InteractiveShell        
InteractiveShell.ast_node_interactivity = "all"

================初始化list或者array===========

#array
#dim=c(2,3,4),创建4个矩形矩阵,每个矩阵2行3列。
result <- array(NA,dim=c(dim(X)[2], 4, dim(Y)[2]))
rownames(result) <- colnames(X)
colnames(result) <- c("p","q","z","est")

#list
p_result <-rep(NA,length=dim(X)[2])
p_result[i] <- ...

#生成重复序列
rep(c("1","2","3"),times=c(5,42,34))

================移除/提取特定行和列============

#移除行列
dat<-dat[!rownames(dat) %in% c("12691.PC30","12691.PC10"),]
dat<-dat[,!colnames(dat) %in% c("specimen_id")]
#提取特定行列 需要注意,这个方法如果没有这个列名/行名的话,会产生NA,首选推荐使用subset
dat<-dat[,c("specimen_id","disease_type_consol")]
dat<-dat[c("12691.PC30","12691.PC10"),]

#多个条件提取行/指定信息
subset(dat,host_age<=50&HvsC=="Cancer")
subset(dat,host_age<=50&HvsC=="Cancer",select=specimen_id)
#这个方法有个bug,NA的数据也会一并读入,且整一行都会变成NA,所以用完这个之后,还要去除NA行

在eset_f中,只提取dif表中有的行,并存储成selected 表

dif <- dif[dif[, "P.Value"]<0.01,]
dim(dif)
#head(dif)

library(pheatmap)
selected <- eset_f[rownames(dif), ]

================列表删除特定元素============

list<-list[list != "SMP0000715"]

=============两个直方图结合在一起============

#par(mfrow=c(2,2))
p1=hist(Ct,breaks = 100)
p2=hist(Cn,breaks = 100)

plot(p1,col=rgb(0,0,1,1/4))    #xlim=c(-1,1)
plot(p2,col=rgb(1,1,0,1/4), add=T)  # second
abline(v=0,col="red")

==============数据标准化+年龄&性别==========

一般数据要做标准化noramlization,保证不同批次、不同样本间有可比较性,一般再用PCA检查一下还有没有批次效应batch effect 。voom的操作中有log处理,相当于scaling,数据范围缩小。

四分位标准化quantile normalizations

四分位标准化原理:样本内部按特征的大小排序(上),然后横向取同一rank的所有数据,求平均值并用平均值取代该rank的所有数据(下)。qnorm能有效降低离群异常值的影响。

#read in data
qcMetadata <- as.data.frame(read.csv("sampleData.csv", header = TRUE, sep = ",", dec = ".",stringsAsFactors=FALSE))
row.names(qcMetadata) <- qcMetadata[,1]
qcMetadata<-qcMetadata[,-1]
#samples in row and features in colname
qcMetadata[1:5,]

qcData <- as.data.frame(read.csv("Raw-Data.csv", header = TRUE, sep = ",", dec = ".",stringsAsFactors=FALSE))
row.names(qcData) <- qcData[,1]
qcData<-qcData[,-1]
qcData[1:5,]

## Load packages ##
  library(limma)
  library(edgeR)
  library(dplyr)
  library(snm)
  library(doMC)
  library(tibble)
  library(gbm)
  
  numCores <- detectCores()
  registerDoMC(cores=numCores)
  
  # Set up design matrix
  covDesignNorm <- model.matrix(~0 + disease_type_consol +
                                    host_age + # host_age should be numeric
                                    sex, # sex should be a factor
                                  data = qcMetadata)
  
  # Check row dimensions
  dim(covDesignNorm)[1] == dim(qcData)[1]
  
  #print(colnames(covDesignNorm))
  # The following corrects for column names that are incompatible with downstream processing
  colnames(covDesignNorm) <- gsub('([[:punct:]])|\\s+','',colnames(covDesignNorm))
  #print(colnames(covDesignNorm))
  
  # Set up counts matrix
  counts <- t(qcData) # DGEList object from a table of counts (rows=features, columns=samples)
  
  # Quantile normalize and plug into voom
  dge <- DGEList(counts = counts)
  vdge <- voom(dge, design = covDesignNorm, plot = TRUE, save.plot = TRUE, 
                                              normalize.method="quantile")
  
  # List biological and normalization variables in model matrices
  bio.var <- model.matrix(~disease_type_consol,
                          data=qcMetadata)
  
  adj.var <- model.matrix(~host_age + sex,
                          data=qcMetadata)
  
  colnames(bio.var) <- gsub('([[:punct:]])|\\s+','',colnames(bio.var))
  colnames(adj.var) <- gsub('([[:punct:]])|\\s+','',colnames(adj.var))
  #print(dim(adj.var))
  #print(dim(bio.var))
  #print(dim(t(vdge$E)))
  #print(dim(covDesignNorm))
  
  snmDataObjOnly <- snm(raw.dat = vdge$E, 
                        bio.var = bio.var, 
                        adj.var = adj.var, 
                        rm.adj=TRUE,
                        verbose = TRUE,
                        diagnose = TRUE)
  snmData <- as.data.frame(t(snmDataObjOnly$norm.dat))
  snmData[1:5,]
这里选用了quantile norm方法,样本数据(上),原始数据(中),标准化后数据(下)
一个点代表一个样本,这些点要与红线轨迹比较接近,才证明这个voom的数据可用

boxplot初步看数据有没有标准化

exp=t(qcData) #原始数据
#for exp data format: samples in colname and features in row
exp[1:2,]
qnexp=t(snmData) #标准化后数据
qnexp[1:2,]

par(mfcol=c(3,1))
boxplot(exp[,sample(1:ncol(exp), 30)], main="raw read count distribution by sample")
boxplot(log2(1+exp[,sample(1:ncol(exp), 30)]), main="loged(scale) read count distribution by sample")
boxplot(qnexp[,sample(1:ncol(qnexp), 30)], main="quantile norm read count distribution by sample")

第二个boxplot, log时+1是为了避免原始数据有大量0,随机从数据集中选30个样本来比较。

============boxpot/散点图分颜色 =============

#normal distribution vs tumor distribution
library(dplyr)
library(ggplot2)
library(ggpubr)
#tmp %>% ggplot(aes(MNAgroup,methy_rate,fill=MNAgroup))+geom_boxplot()+geom_jitter(alpha=0.5)

compare_means(methy_rate ~ MNAgroup, data = tmp)
p <- ggboxplot(tmp, x = "MNAgroup", y = "methy_rate", palette = "jco",color="MNAgroup",add = "jitter")
#  Add p-value
p + stat_compare_means()
# Change method
#p + stat_compare_means(method = "t.test")


============Bootstrap自助法==========

library(broom)
library(glmnet)

indices=1
    y<-as.numeric(as.character(metabolites[,indices]))
    x1<-as.numeric(as.character(metagenomics[,indices]))
    x2<-as.numeric(as.character(data[,1]))
    x3<-as.numeric(as.character(data[,2]))
rsq <- function(formula,data, indices) {
    fit <- lm(formula, data=as.data.frame(data))
    pvalue<-tidy(summary(fit))$p.value[2]
    ar2<-summary(fit)$adj.r.square
    re=c(ar2,pvalue)
    return(re)
} 	

library(boot)
set.seed(1234)
results <- boot(data=data, statistic=rsq, 
                R=1000, formula=y~ x1+x2+x3)
print(results)
#plot(results)
#boot.ci(results, type=c("perc", "bca"))
有浮动的数据 bias 和 std.error 不为0

Broom 用来提取模型结果,其他相关资料:


==================缺失值处理/impute================

#方法一 最小值填充
library(Hmisc)
impute_min=function(x) impute(x,min)
new=as.data.frame(apply(data,2,impute_min)) #2是按列填充,1是按行填充,data是输入数据

#多重填补 MICE包
library(mice)
library(mice)
imputed=mice(data,m=5,meth="pmm",seed=1007)
head(complete(imputed))

============correlation analysis==============

#带NA的做不了correlation analysis,所以要先impute。
# 千万注意是要看样本还是特征的相关度,在这里是指同一个data frame的的非重复两两比较

head(dat_tmp)
#COORELATION
cor <- 0
d<-c("0","0","0")
end=ncol(dat_tmp)-1
for (i in 1:end) {
    x = dat_tmp[,i]
    s=i+1
    for (j in s:ncol(dat_tmp)) {     #这里以列列比较为例
        cor_n <- cor(x, y = dat_tmp[,j], method = "spearman")  #spearman or pearson
        cor<-append(cor,cor_n)
        if(as.numeric(as.character(cor_n))>0.9 ){   #按需设定correlation, 输出在d,输出内容越多,运行越慢,如果想全输出,可以略去if这一步
            n1<-colnames(dat_tmp)[i]
            n2<-colnames(dat_tmp)[j]
            re<-c(n1,n2,cor_n)
            d<-rbind(d,re)
        }
    }
}

summary(cor[-1])
hist(cor[-1],breaks=100,xlim=c(-1,1))  #correlation distribution,看相关度大概率落在哪个范围

d<-d[order(d[-1,3],decreasing=TRUE),]
head(d)
head(dat_tmp)
head(d)

============Permutation Test置换检验==========

Permutation Test:

data<-as.data.frame(read.table("data/metabolomics_metagenomics_34JC.csv",sep=",",header=T,row.names=1,stringsAsFactors=F))
data<-t(data[-c(1,2),])
metagenomics<-data[,1:169]
metabolites<-data[,170:295]

#shuffles label in one group
j = sample(1:nrow(metabolites),1*nrow(metabolites)) 
#metabolites = metabolites[j,] 
#metagenomics = metagenomics[j,]

#metagenomics[1:2,]
#metabolites[1:2,]

cor <- numeric(dim(metagenomics)[2])
for (i in 1:dim(metagenomics)[2]) {
cor[i] <- cor(x = metabolites[,1], y = metagenomics[,i], use = "everything", method = "spearman")
}
#med
hist(cor,breaks=100)
#inter <- quantile(med, c(0.025,0.975))
#inter

应用说明见下文最后:

==================数组排序================

d<-d[order(d[,2],decreasing=TRUE),]

================根据两列数据统计数据且画直方图=========

#calculate sample type frequence in each cancer type
library(dplyr)
summaryData<-as.data.frame(group_by(dat,ONCOTREE_CODE,SAMPLE_TYPE) %>% summarise(.,count=n()))
summaryData
head(dat)
head(summaryData)
#bar plot
par(mfrow=c(1,2))
ggplot(summaryData,aes(x=ONCOTREE_CODE, y=count, fill=SAMPLE_TYPE))+geom_bar(stat='identity')
ggplot(summaryData,aes(x=ONCOTREE_CODE, y=count, fill=SAMPLE_TYPE))+geom_bar(stat='identity',position="fill")
#identity means use y value as input.add 'position="dodge"' means two bar plot but not add together

===========查看a、b两个数据集合的共同部分===========

table(rownames(count_df) %in% hsk_gene_df$gene_id)
#提取共有部分
hsk_gene_id.count <- count_df[rownames(count_df) %in% hsk_gene_df$gene_id,]
从上到下为 count_df 和 hsk_gene_df

=======a表的行名/列名 按b表的行名/列名排序=============

eset_f[1:2,]
info=info[sort(colnames(eset_f)),]
info[1:2,]

==================循环函数================

#求两列数的相关系数
#如果数列里有NA,那么计算结果也会直接变成NA

cor<-lapply(1:length(list),function(x){
    
    a=dat[,list[x]]
    b=dat[,paste("RNA_",list[x], sep = "")]
    pcor = cor(a,b,method="pearson")
    re<-c(list[x],pcor)
    return(re)
    })  

d<-do.call(rbind, cor)
d<-as.data.frame(d)
d<-d[order(d[,2],decreasing=TRUE),]
colnames(d)<-c("pethway","correlation")
d[1:5,]

=================按列/行求和======================

#列
apply(eset_f,2,sum)
colSums(eset_f)

#行
rowSums(eset_f)
apply(eset_f,2,sum)

#直接加入表中
#info$ExpSum=apply(eset_f,2,sum)

===============按特定列分组/聚集 group_by==================

library(dplyr)
#除了分组列,其他每一组都计算
gdat = as.data.frame(gdat %>% group_by(tcga_code) %>% summarise_each(funs(mean)))
#如果只想按分组并计算某一列“target_col"
#gdat %>% group_by(tcga_code) %>% summarise(n=mean(2-aminoadipate))
row.names(gdat)=gdat[,1]
gdat=gdat[,-1]
head(gdat)
head(gdat) 根据tcga_code分组,同一组的数据取平均值
output ,每一行变成了一种肿瘤

=================Boxplot related======================

#boxplot 加点加pvalue
library(ggplot2)
library(ggpubr)

compare_means(relapse~score, data, method="wilcox.test", paired=FALSE)
p <- ggboxplot(info, x="relapse",
               y = "score", color = "relapse",
               palette = "jco", add = "jitter")
# add p-value
p + stat_compare_means()

table(info$relapse)
#多变量分组
library(ggplot)
ggplot(dd, aes(type, riskScore, color=factor(category), fill=type)) + geom_boxplot()
head(dd)
#bean plot
ggplot(data, aes(x=group, y=total_concentration, color=group)) +
  geom_violin(trim=FALSE) +geom_boxplot(width=0.1)
bean plot 也叫小提琴图

=================火山图======================

exprSet=eset_f
group_list=group

library(edgeR)
dge <- DGEList(counts=exprSet)
dge <- calcNormFactors(dge)
######### analysis
comp='gtMedian-ltMedian'  #after minor before
v <- voom(dge,group_list,plot=TRUE, normalize="quantile")
#v[1:4,1:10]
fit <- lmFit(v, group_list)
cont.matrix=makeContrasts(contrasts=c(comp),levels = group_list)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)
#### get data
tempOutput = topTable(fit2, coef=comp, n=Inf)
DEG_limma_voom = na.omit(tempOutput)
head(DEG_limma_voom)
# logFCdone extract column 1 and 4
#save(DEG_limma_voom,file = "testset.limma.Rdata")
#write.csv(DEG_limma_voom,"testset.limma.csv")
#draw picture
library(ggplot2)
DEG=DEG_limma_voom
colnames(DEG)
plot(DEG$logFC,-log10(DEG$P.Value))
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs( logFC)) )
logFC_cutoff=1  #一般设1,即2倍表达量
DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
                              ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
table(DEG$change)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                    '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
                    '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
g = ggplot(data=DEG, aes(,x=logFC, y=-log10(P.Value),color=change)) + geom_point(alpha=0.4, size=1.75) +
  theme_set(theme_set(theme_bw(base_size=20)))+ xlab("log2 fold change") + ylab("-log10 p-value") + 
  ggtitle( this_tile ) +   theme(plot.title = element_text(size=15,hjust = 0.5)) + 
  scale_colour_manual(values = c('blue','black','red')) 	## corresponding to the levels(res$change)
print(g)
#ggsave(g,filename = 'volcano.pvalue.png')
group
eset_f 样本在列,基因在行

====================热图======================

常规办,读取数据,scale数据并画图。

info<-as.data.frame(read.table("data/TIMER.csv",header = TRUE,sep = ",", dec = ".",na.strings = "NA",stringsAsFactors=FALSE,check.names = FALSE))
row.names(info)=info[,1]
info=info[,-1]
#info<-info[!rownames(info) %in% c("T cell CD4+ (non-regulatory)_QUANTISEQ"),]
head(info)
dim(info)

#检查有无全0的数据,有则删除(用上面#注释那一个命令,按列名删除)
dat=as.data.frame(info)
#dat[1:5,]
f<-function(x) sum(x==0)
#"1":按行统计0个数
d<-as.data.frame(apply(dat,1,f))
d$ID=row.names(d)
d[order(d[,1],decreasing=TRUE),]

library(ComplexHeatmap)
library(pheatmap)
aa=t(info)
aa=scale(aa)
aa=t(aa)

#图1
Heatmap(aa,cluster_rows = TRUE,show_row_names = T)
#图2
p<-pheatmap(aa,show_rownames=T, cluster_cols=T, cluster_rows=T,cex=1, clustering_distance_rows="euclidean", cex=1,clustering_distance_cols="euclidean", clustering_method="complete", border_color=FALSE,cutree_col = 3)
p
colnames(aa[,p$tree_col[["order"]]]) #按图顺序从左到右获取样本名
rownames(aa[p$tree_row[["order"]],]) #按图顺序从上到下获取特征名
info,样本在列,特征(基因/细胞数量)在行
0,代表这一列有0个“零&amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;quot;/&amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;quot;NA”
library(ComplexHeatmap) 的 heatmap 作图
library(pheatmap) 作图,分了三组,这个包可设定的功能更多
提取样本分组信息

按分组提取样本

p<-pheatmap(data,scale="row",show_colnames=T, show_rownames=F,cluster_cols=T, cluster_rows=T,cex=1, clustering_distance_rows="euclidean", cex=1,clustering_distance_cols="euclidean", clustering_method="complete", border_color=FALSE,cutree_col = 3)
p
cluster = cutree(p$tree_col,k=3)
#table(cluster)
as.data.frame(cluster)

进阶版,选取差异基因画热图(这样看起来更好看),同时添加额外信息在图上一并展现。

library(ComplexHeatmap)
dif = DEG_limma_voom  # 同上,只提取差异表达的即引来画图,热图会更好看
dif <- dif[dif[, "P.Value"]<0.01,]
dim(dif)
#head(dif)

library(pheatmap)
selected <- eset_f[rownames(dif), ]
#head(selected)
rownames(selected) <- dif$symbols

annotation = data.frame(value = rnorm(80))
value = info[colnames(selected),]$Yield
yield = HeatmapAnnotation(df = annotation,points = anno_points(value), annotation_height = c(1, 2))
value = info[colnames(selected),]$RIN
rin = HeatmapAnnotation(points = anno_points(value))

aa=t(selected[1:dim(dif)[1],])
aa=scale(aa)
aa=t(aa)
#pheatmap(selected[1:dim(dif)[1],],top_annotation = yield,bottom_annotation = rin, fontsize_row = 1, scale = "row", border_color = NA)
Heatmap(aa, top_annotation = yield, bottom_annotation = rin,
        cluster_rows = TRUE,show_row_names = F)
#pheatmap(selected,scale = "row",border_color = NA) #这里可以指明按行or 按列做scale(在这里是让每个基因的数值做scale),所以也可以不置换。
dif(上)&amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp; selected(下)

scale是为了让数字集中一些,避免离群值对颜色的显示有影响,让颜色分层更加平缓。一般scale()中的数据是样本在行,特征在列,如果不是的就需要先转置一下做数据处理。

热图的特殊操作

  1. 如果有异常值(特别大或者特别小),可以限制上限下限值避免离群值影响颜色梯度
#接在scale(aa)和t(aa)之后
aa[aa>=3]=3
aa[aa<=-3]=-3


2. 计算分数并按中位数/ 按照某个数字分组

colnames(d)[1]="score"
cutoff=mean(d[,1])   #取中位数作为分隔值
cutoff
a=d[d[, "score"]>cutoff,]
b=d[d[, "score"]<=cutoff,]
a$group=2
b$group=1
score=rbind(a,b)
score=score[,-2]
table(score$group)
head(score)
根据score分组

=====================PCA=====================

cutoff=median(info[, "Yield"])
c=info[info[, "Yield"]>=cutoff,]
d=info[info[, "Yield"]<cutoff,]
c$label="Yield_GT"
d$label="Yield_LT"

label=rbind(c,d)
label=label[sort(colnames(eset_f)),]
dim(label)

eset_pca=t(eset_f)
eset_pca<-as.data.frame(eset_pca)
eset_pca=cbind(label[,"label"],eset_pca)
colnames(eset_pca)[1]="label"

eset_pca[1:2,]

dat=eset_pca

#install.packages('ggfortify')
library(ggfortify)

# apply PCA - scale. = TRUE is highly advisable, but default is FALSE. 
#par(mfcol=c(2,2))
out_pca <- prcomp(dat[,-1],scale= TRUE)
#plot(out_pca,type="l")
autoplot(out_pca,data=dat,colour='label',size=0.1,label=TRUE,label.size=3)
eset_pca(加了label)prcomp()的数据要求样本在行,特征在列

#一般都会做scale,让图好看一点

# 如果是考虑根据某个数值,使PCA图表现不同的颜色(同时展现PCA和样本数值大小3个维度),可以考虑下面这个方法
data=as.data.frame(cbind(out_pca$x[,1],out_pca$x[,2]))
data$yield= info[rownames(out_pca$x),]$Yield
data[1:5,]

ggplot(data) + geom_point(aes(V1, V2, colour = yield), size = 5)+labs(x = "pc_1",y = "pc_2") +theme_bw()+scale_color_gradientn(colours = rainbow(4))

=================PCA 与 PCoA==================


scale(),要求数据样本在行,基因在列
输出应该和PCA非常相似


=========== 表格逐一进行加减乘除计算并替换原表格===========

head(info)
#sweep(数据,1列/2行,运算的数字,"+","*","/","-")
info=sweep(data.matrix(info[,-1]),1,info$V2, FUN = "*")
head(info)

=========== 表格全部变为log形式===========

#log10 transfer
library(dplyr)
dat=as.data.frame(sweep(data.matrix(dat_rna),1,0.000001,FUN = "+"))   #log(X+0.000001)
dat_rna %>% mutate_each(funs(log10)) -> tmp
row.names(tmp)=row.names(dat_rna)
dat_rna=round(tmp,3)   #float size
tmp=""
#head(dat_rna)
dim(dat_rna)

=============boxplot/散点图加颜色加概率==============

head(tmp)

#normal distribution vs tumor distribution
library(dplyr)
library(ggplot2)
library(ggpubr)
#tmp %>% ggplot(aes(group,methy_rate,fill=group))+geom_boxplot()+geom_jitter(alpha=0.5)

compare_means(methy_rate ~ group, data = tmp)
p <- ggboxplot(tmp, x = "group", y = "methy_rate", palette = "jco",color="group",add = "jitter")
#  Add p-value
p + stat_compare_means()
# Change method
#p + stat_compare_means(method = "t.test")
head(tmp)
library(ggplot2)
library(dplyr)
head(dat_p)
dat_p %>% ggplot(data = dat_p, mapping = aes(x = x, y = y, colour = culture))+geom_smooth(method = lm)+geom_point()
head(dat_p)

============jupyter notebook中添加 R内核 =============

#R包大部分都无法下载,添加清华镜像
> options(repos=structure(c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))) 

#命令行界面中进入R
$ R
#查看当前有的镜像
> getOption("repos")
                                        CRAN
"https://mirrors.tuna.tsinghua.edu.cn/CRAN/"

#在R中安装devtools库
install.packages('devtools')
#在R中安装IRkernel
devtools::install_github('IRkernel/IRkernel')
或 直接用 install.packages('IRkernel') 来安装

#在R中激活,
#IRkernel::installspec()
IRkernel::installspec(name = 'ir33', displayname = 'R 3.3')  #也可以设定名字,这样子可以同时存在不同版本的R
[InstallKernelSpec] Installed kernelspec ir in /home/hpli/.local/share/jupyter/kernels/ir

之后打开jupyter notebook 就能看见R了,追加一个让R更好看的包
> install.packages("formatR", repos = "http://cran.rstudio.com")

之后退出R,打开jupyter notebook,就可以用R了

编辑于 2021-07-06 02:20