注意:今天的教程比较长,请规划好你的时间。本文是付费内容,在本文文末有本教程的全部的代码和示例数据。


输出结果

分析代码

关于WGCNA分析,如果你的数据量较大,建议直接在服务器上进行分析,本地分析可能导致R崩溃。

设置文件位置

# Working directory that holds the input tables and receives all output files.
work_dir <- "~/00_WGCNA/20230217_WGCNA/WGCNA_01"
setwd(work_dir)

加载分析所需的安装包

# Install WGCNA only when it is not available yet, then attach it.
# In the collapsed one-line form the inline `#` commented out both
# library(WGCNA) and options(), so the package was never loaded.
if (!requireNamespace("WGCNA", quietly = TRUE)) {
  install.packages("WGCNA")
  # Alternative installer: BiocManager::install("WGCNA")
}
library(WGCNA)
# Keep character columns as strings when building data frames.
options(stringsAsFactors = FALSE)

注意,如果你想开启多线程分析,可以使用以下代码

# Optional: let WGCNA use multiple threads for its calculations.
enableWGCNAThreads()

一、导入基因表达量数据

# Load the gene expression table into WGCNA.fpkm (first column = gene ID,
# remaining columns = samples). The two readers are alternatives: the CSV
# file is used when present (it was the last assignment in the original
# sequence), otherwise the tab/space-delimited TXT file is read.
if (file.exists("ExpData_WGCNA.csv")) {
  WGCNA.fpkm <- read.csv("ExpData_WGCNA.csv", header = TRUE,
                         check.names = FALSE)
} else {
  WGCNA.fpkm <- read.table("ExpData_WGCNA.txt", header = TRUE,
                           comment.char = "", check.names = FALSE)
}

数据处理

# Inspect the raw table.
dim(WGCNA.fpkm)
names(WGCNA.fpkm)

# Transpose so that rows are samples and columns are genes.
datExpr0 <- as.data.frame(t(WGCNA.fpkm[, -1]))
# Gene IDs come from the first column, whatever its header is called
# (the original required it to be named "sample").
names(datExpr0) <- WGCNA.fpkm[[1]]
# Sample names are the remaining column headers.
rownames(datExpr0) <- names(WGCNA.fpkm)[-1]

过滤数据

# Flag genes and samples that goodSamplesGenes() considers unusable.
gsg <- goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK

if (!gsg$allOK) {
  # Report what is about to be dropped.
  if (sum(!gsg$goodGenes) > 0) {
    printFlush(paste("Removing genes:",
                     paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
  }
  if (sum(!gsg$goodSamples) > 0) {
    printFlush(paste("Removing samples:",
                     paste(rownames(datExpr0)[!gsg$goodSamples],
                           collapse = ", ")))
  }
  # Remove the offending genes and samples from the data.
  datExpr0 <- datExpr0[gsg$goodSamples, gsg$goodGenes]
}

过滤低于设定的值的基因

# Drop genes whose mean expression across samples is not above the cutoff.
meanFPKM <- 0.5  # filtering threshold; adjust as needed

# colMeans replaces the original trick of appending a temporary "mean" row.
# na.rm = TRUE keeps missing values tolerated by goodSamplesGenes() from
# turning a gene's mean into NA.
keepGenes <- colMeans(datExpr0, na.rm = TRUE) > meanFPKM
datExpr0 <- datExpr0[, keepGenes, drop = FALSE]

# Write the filtered matrix back in genes x samples layout with the gene ID
# in a leading "sample" column. check.names = FALSE keeps sample names
# exactly as they were read in.
filtered_fpkm <- t(datExpr0)
filtered_fpkm <- data.frame(rownames(filtered_fpkm), filtered_fpkm,
                            check.names = FALSE)
names(filtered_fpkm)[1] <- "sample"
head(filtered_fpkm)
write.table(filtered_fpkm, file = "mRNA.filter.txt",
            row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")

Sample cluster

# Average-linkage clustering of samples, used to spot outliers.
sampleTree <- hclust(dist(datExpr0), method = "average")

pdf(file = "1.sampleClustering.pdf", width = 15, height = 8)
par(cex = 0.6)
par(mar = c(0, 6, 6, 0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub = "",
     xlab = "", cex.lab = 2, cex.axis = 1.5, cex.main = 2)
# Optional cut line; left disabled because the cut height is data dependent.
# abline(h = 180, col = "red")
# Close the device so the PDF is written (this call was previously swallowed
# by the trailing comment).
dev.off()

不过滤数据

如果你的数据不进行过滤,可以直接进行以下操作;此步与前面的操作相同,任选一种即可。

# Cut the sample tree at a fixed height and drop unassigned samples.
# Determine clusters under the cut line.
clust <- cutreeStatic(sampleTree, cutHeight = 50000, minSize = 10)
table(clust)

# Label 0 marks samples that were not assigned to any cluster; every sample
# with a non-zero label is kept.
keepSamples <- (clust != 0)
if (!any(keepSamples)) {
  stop("cutreeStatic() assigned no sample to a cluster; ",
       "adjust cutHeight/minSize.", call. = FALSE)
}
datExpr0 <- datExpr0[keepSamples, ]
write.table(datExpr0, file = "mRNA.symbol.uniq.filter.sample.txt",
            row.names = TRUE, col.names = TRUE, quote = FALSE, sep = "\t")

# Re-cluster the retained samples and plot the tree.
sampleTree <- hclust(dist(datExpr0), method = "average")
pdf(file = "1.sampleClustering.filter.pdf", width = 12, height = 9)
par(cex = 0.6)
par(mar = c(0, 4, 2, 0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub = "",
     xlab = "", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
# Optional cut line; left disabled because the cut height is data dependent.
# abline(h = 50000, col = "red")
dev.off()

二、导入性状数据

# Load the trait table; the first column holds the sample names.
traitData <- read.table("TraitData.txt", row.names = 1, header = TRUE,
                        comment.char = "", check.names = FALSE)
allTraits <- traitData
dim(allTraits)
names(allTraits)

# Build a trait data frame whose rows follow the sample order of datExpr0.
fpkmSamples <- rownames(datExpr0)
traitSamples <- rownames(allTraits)
traitRows <- match(fpkmSamples, traitSamples)
# Samples without trait data would otherwise become all-NA rows unnoticed.
if (anyNA(traitRows)) {
  warning("Samples missing from the trait table: ",
          paste(fpkmSamples[is.na(traitRows)], collapse = ", "),
          call. = FALSE)
}
datTraits <- allTraits[traitRows, ]
rownames(datTraits)
collectGarbage()

再次样本聚类

# Re-cluster the samples that remain after filtering.
sampleTree2 <- hclust(dist(datExpr0), method = "average")
# Convert traits to a color representation: white means low, red means high,
# grey means missing entry. (This assignment was previously swallowed by the
# comment on the same line.)
traitColors <- numbers2colors(datTraits, signed = FALSE)

输出样本聚类图

# Sample dendrogram with the trait color bands underneath.
pdf(file = "2.Sample_dendrogram_and_trait_heatmap.pdf", width = 20,
    height = 12)
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits),
                    main = "Sample dendrogram and trait heatmap",
                    cex.colorLabels = 1.5, cex.dendroLabels = 1,
                    cex.rowText = 2)
dev.off()

三、WGCNA分析(后面都是重点)

筛选软阈值

enableWGCNAThreads()
# Candidate soft-thresholding powers to evaluate.
powers <- 1:30
# Fit statistics for each candidate power (this step can take a while).
sft <- pickSoftThreshold(datExpr0, powerVector = powers, verbose = 5)

此步骤是比较耗费时间的,静静等待即可。

绘制soft Threshold plot

# Two-panel soft-threshold diagnostic: scale-free fit and mean connectivity.
# The original called dev.off() without opening a device and used an
# undefined `cex1`; both are set up here.
pdf(file = "3.Soft_threshold.pdf", width = 9, height = 5)
par(mfrow = c(1, 2))
cex1 <- 0.9  # text size of the power labels

# Scale-free topology fit index as a function of the soft-thresholding power.
plot(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit,signed R^2",
     type = "n", main = paste("Scale independence"))
text(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "red")
# This line corresponds to using an R^2 cut-off of 0.8.
abline(h = 0.8, col = "red")

# Mean connectivity as a function of the soft-thresholding power.
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)", ylab = "Mean Connectivity",
     type = "n", main = paste("Mean connectivity"))
text(sft$fitIndices[, 1], sft$fitIndices[, 5],
     labels = powers, cex = cex1, col = "red")
dev.off()

选择softpower

选择softpower是一个带有"玄学"色彩的过程:可以直接使用软件推荐的最佳softpower值,但这不一定能得到你想要的最好结果;也可以自己选择认为比较合适的softpower值,但这需要不断地尝试和筛选。因此,从这一步开始,WGCNA的分析结果就会因参数选择的不同而受到影响。

# Soft-thresholding power used for the network construction below.
# Option 1: use the power suggested by pickSoftThreshold().
# softPower <- sft$powerEstimate
# Option 2: set the power manually. In the collapsed one-line form this
# assignment was part of a comment, so softPower was never defined.
softPower <- 9

继续分析

# Gene-by-gene adjacency matrix at the chosen soft-thresholding power.
# The result deliberately reuses the name `adjacency`, as later steps expect.
adjacency <- adjacency(datExpr0, power = softPower)

将邻接转化为拓扑重叠

这一步建议去服务器上跑,后面的步骤就在服务器上跑吧,数据量太大;如果你的数据量较小,本地也就可以

# Topological overlap matrix computed from the adjacency matrix.
TOM <- TOMsimilarity(adjacency)
# Turn similarity into a dissimilarity for clustering.
dissTOM <- 1 - TOM
# Average-linkage hierarchical clustering of genes on the TOM dissimilarity.
geneTree <- hclust(as.dist(dissTOM), method = "average")

绘制聚类树(树状图)

# Gene dendrogram built from the TOM-based dissimilarity.
pdf(file = "4_Gene clustering on TOM-based dissimilarity.pdf", width = 24,
    height = 18)
plot(geneTree, xlab = "", sub = "",
     main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)
dev.off()

加入模块

# Minimum number of genes a module must contain.
minModuleSize <- 30

# Module identification using dynamic tree cut.
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
                             deepSplit = 2, pamRespectsDendro = FALSE,
                             minClusterSize = minModuleSize)
table(dynamicMods)

# Convert numeric labels into colors.
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)

# Plot the dendrogram with the module colors underneath.
pdf(file = "5_Dynamic Tree Cut.pdf", width = 8, height = 6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")
dev.off()

合并模块

做出的WGCNA分析中,具有较多的模块,但是在我们后续的分析中,是使用不到这么多的模块,以及模块越多对我们的分析越困难,那么就必须合并模块信息。具体操作如下。

# Module eigengenes for the dynamic-tree-cut modules.
MEList <- moduleEigengenes(datExpr0, colors = dynamicColors)
MEs <- MEList$eigengenes
# Dissimilarity of module eigengenes (1 - correlation).
MEDiss <- 1 - cor(MEs)
# Cluster module eigengenes.
METree <- hclust(as.dist(MEDiss), method = "average")

# Plot the eigengene tree together with the merge threshold.
pdf(file = "6_Clustering of module eigengenes.pdf", width = 7, height = 6)
plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "")
# Merge height; adjustable. 0.4 on the 1 - cor scale corresponds to an
# eigengene correlation of 0.6.
MEDissThres <- 0.4
# Draw the cut line into the dendrogram.
abline(h = MEDissThres, col = "red")
dev.off()


合并及绘图

 = mergeCloseModules(datExpr0, dynamicColors, cutHeight = MEDissThres, verbose = 3)# The merged module colorsmergedColors = merge$colors# Eigengenes of the new merged modules:mergedMEs = merge$newMEstable(mergedColors)#sizeGrWindow(12, 9)pdf(file="7_merged dynamic.pdf", width = 9, height = 6)plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),c("Dynamic Tree Cut", "Merged dynamic"),dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)dev.off()

Rename to moduleColors

# From here on, work with the merged modules.
moduleColors <- mergedColors
# Construct numerical labels corresponding to the colors ("grey" maps to 0).
colorOrder <- c("grey", standardColors(50))
moduleLabels <- match(moduleColors, colorOrder) - 1
MEs <- mergedMEs

性状数据与基因模块进行分析

# Correlate module eigengenes with the traits.
nGenes <- ncol(datExpr0)
nSamples <- nrow(datExpr0)
# use = "p" partially matches "pairwise.complete.obs".
moduleTraitCor <- cor(MEs, datTraits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)

绘制模块性状相关性图

# Heatmap of module-trait correlations, annotated with the p-values.
pdf(file = "8_Module-trait relationships.pdf", width = 10, height = 10)
# Cell text: correlation on the first line, p-value in parentheses below.
textMatrix <- paste(signif(moduleTraitCor, 2), "\n(",
                    signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) <- dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Display the correlation values within a heatmap plot.
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(datTraits),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1, 1),
               main = paste("Module-trait relationships"))
dev.off()

计算MM和GS

# Module membership (MM): correlation of each gene with each module eigengene.
# Module names are the eigengene column names with the first two characters
# removed.
modNames <- substring(names(MEs), 3)
geneModuleMembership <- as.data.frame(cor(datExpr0, MEs, use = "p"))
MMPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneModuleMembership), nSamples)
)
names(geneModuleMembership) <- paste("MM", modNames, sep = "")
names(MMPvalue) <- paste("p.MM", modNames, sep = "")

# Gene significance (GS): correlation of each gene with each trait.
traitNames <- names(datTraits)
geneTraitSignificance <- as.data.frame(cor(datExpr0, datTraits, use = "p"))
GSPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)
)
names(geneTraitSignificance) <- paste("GS.", traitNames, sep = "")
names(GSPvalue) <- paste("p.GS.", traitNames, sep = "")

批量绘制性状与各个模块基因的相关性图

# One MM-vs-GS scatter plot (PDF) per trait/module combination.
for (trait in traitNames) {
  traitColumn <- match(trait, traitNames)
  for (module in modNames) {
    column <- match(module, modNames)
    moduleGenes <- moduleColors == module
    # Need more than one gene for the scatter plot. minModuleSize normally
    # guarantees this, but the grey module may hold a single gene, which
    # would otherwise abort the loop.
    if (sum(moduleGenes) > 1) {
      pdf(file = paste("9_", trait, "_", module,
                       "_Module membership vs gene significance.pdf",
                       sep = ""),
          width = 7, height = 7)
      par(mfrow = c(1, 1))
      verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                         abs(geneTraitSignificance[moduleGenes, traitColumn]),
                         xlab = paste("Module Membership in", module,
                                      "module"),
                         ylab = paste("Gene significance for ", trait),
                         main = paste("Module membership vs. gene significance\n"),
                         cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
                         col = module)
      dev.off()
    }
  }
}

# Gene identifiers, used when exporting the GS/MM table.
names(datExpr0)
probes <- names(datExpr0)


输出GS和MM数据

# Assemble one table: gene, module color, then GS/p.GS per trait and
# MM/p.MM per module.
geneInfo0 <- data.frame(probes = probes, moduleColor = moduleColors)

# Append each trait's gene significance and its p-value, side by side.
for (Tra in seq_len(ncol(geneTraitSignificance))) {
  oldNames <- names(geneInfo0)
  geneInfo0 <- data.frame(geneInfo0, geneTraitSignificance[, Tra],
                          GSPvalue[, Tra])
  names(geneInfo0) <- c(oldNames, names(geneTraitSignificance)[Tra],
                        names(GSPvalue)[Tra])
}

# Append each module's membership and its p-value, side by side.
for (mod in seq_len(ncol(geneModuleMembership))) {
  oldNames <- names(geneInfo0)
  geneInfo0 <- data.frame(geneInfo0, geneModuleMembership[, mod],
                          MMPvalue[, mod])
  names(geneInfo0) <- c(oldNames, names(geneModuleMembership)[mod],
                        names(MMPvalue)[mod])
}

# Sort genes by module color and write a tab-separated table.
geneOrder <- order(geneInfo0$moduleColor)
geneInfo <- geneInfo0[geneOrder, ]
write.table(geneInfo, file = "10_GS_and_MM.xls", sep = "\t",
            row.names = FALSE)

可视化基因网络

# Pick a random subset of genes for the network heatmap; plotting all genes
# is too expensive.
nGenes <- ncol(datExpr0)
nSamples <- nrow(datExpr0)
# Cap at the number of available genes so sample() cannot fail on small
# data sets.
nSelect <- min(400, nGenes)
# For reproducibility, we set the random seed.
set.seed(10)
select <- sample(nGenes, size = nSelect)
selectTOM <- dissTOM[select, select]
# Re-cluster the selected genes only.
selectTree <- hclust(as.dist(selectTOM), method = "average")
selectColors <- moduleColors[select]
# Taking the dissimilarity to a power (7 here) makes the plot more
# informative by effectively changing the color palette; setting the
# diagonal to NA also improves the clarity of the plot.
plotDiss <- selectTOM^7
diag(plotDiss) <- NA

绘图

# Network heatmap of the selected genes, using a gplots color panel.
library("gplots")
pdf(file = "13_Network heatmap plot_selected genes.pdf", width = 9,
    height = 9)
mycol <- colorpanel(250, "red", "orange", "lemonchiffon")
TOMplot(plotDiss, selectTree, selectColors, col = mycol,
        main = "Network heatmap plot, selected genes")
dev.off()

特征基因的基因网络可视化

# Eigengene dendrogram and eigengene adjacency heatmap in one figure.
pdf(file = "14_Eigengene dendrogram and Eigengene adjacency heatmap.pdf",
    width = 5, height = 7.5)
par(cex = 0.9)
plotEigengeneNetworks(MEs, "", marDendro = c(0, 4, 1, 2),
                      marHeatmap = c(3, 4, 1, 2), cex.lab = 0.8,
                      xLabelsAngle = 90)
dev.off()



获得本教程代码链接:WGCNA分析 | 全流程分析代码 | 代码一

小杜的生信筆記 ,主要发表或收录生物信息学的教程,以及基于R的分析和可视化(包括数据分析,图形绘制等);分享感兴趣的文献和学习资料!!