文章目录
分析流程概述
参考文献:hppRNA—a Snakemake-based handy parameter-free pipeline for RNA-Seq analysis of numerous samples
六种主要流程示意图:
下载测试数据
wget ftp://ftp.ccb.jhu.edu/pub/RNAseq_protocol/chrX_data.tar.gz
如果文章中的数据不是fastq格式,而是给GSExxxx,这是需要下载SRA数据
需要安装sratoolkit,SRR_Acc_List.txt文件储存SRR号码
cat SRR_Acc_List.txt | while read id; do (prefetch ${id} &);done
# 批量转换sra到fq格式
ls /public/project/RNA/airway/sra/* | while read id; do ( nohup fastq-dump --gzip --split-3 -O ./ ${id} & ); done
下载的数据:
[sunchengquan 15:45:09 /data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples]
$ ll
总用量 1.8G
-rwxr-xr-x 1 sunchengquan sunchengquan 88M 1月 15 2016 ERR188044_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 88M 1月 15 2016 ERR188044_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 84M 1月 15 2016 ERR188104_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 85M 1月 15 2016 ERR188104_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 108M 1月 15 2016 ERR188234_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 109M 1月 15 2016 ERR188234_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 59M 1月 15 2016 ERR188245_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 60M 1月 15 2016 ERR188245_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 65M 1月 15 2016 ERR188257_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 66M 1月 15 2016 ERR188257_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 39M 1月 15 2016 ERR188273_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 39M 1月 15 2016 ERR188273_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 88M 1月 15 2016 ERR188337_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 88M 1月 15 2016 ERR188337_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 63M 1月 15 2016 ERR188383_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 63M 1月 15 2016 ERR188383_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 86M 1月 15 2016 ERR188401_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 87M 1月 15 2016 ERR188401_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 56M 1月 15 2016 ERR188428_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 56M 1月 15 2016 ERR188428_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 70M 1月 15 2016 ERR188454_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 70M 1月 15 2016 ERR188454_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 75M 1月 15 2016 ERR204916_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 75M 1月 15 2016 ERR204916_chrX_2.fastq.gz
数据质量控制
reads质量评估软件:fastqc生成质控报告,multiqc将各个样本的质控报告整合为一个。
reads质量控制软件:prinseq,cutadapt,trimmomatic,trim_galore
#!/usr/bin/env bash
set -e
settings(){
samples=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples
if test -w $samples;then
mkdir -p {$samples/qc,$samples/cleandata/qc}
else
echo "没有写入权限"
fi
}
thread(){
tmp_fifofile="/tmp/$$.fifo" #脚本运行的当前进程ID号作为文件名
mkfifo "$tmp_fifofile"
exec 6<>"$tmp_fifofile" #将fd6指向fifo类型
rm $tmp_fifofile
thread_num=$1 # 此处定义线程数
for((i=0;i<$thread_num;i++));do
echo
done >&6 # 事实上就是在fd6中放置了$thread个回车符
$2 6 $3
exec 6>&- # 关闭df6
}
qc(){
source activate RNA
printf "[%s %s %s %s %s %s]::数据质量评估\n" $(echo `date`)
start=$(date +%s.%N)
list=$(find $2 -name *q\.gz)
file_num=`ls -l $2/qc|wc -l`
if [ $file_num -lt 2 ];then
for i in $list;do
read -u$1
{
name=`awk -v each=$i 'BEGIN{split(each,arr,"/");l=length(arr);print arr[l]}' `
fastqc $i -o $2/qc &>> $2/qc/qc.log
printf "[%s %s %s %s %s %s]::%s质量评估完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
multiqc -d $2/qc -o $2/qc
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::数据质量评估耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
trim_qc(){
printf "[%s %s %s %s %s %s]::数据质量控制\n" $(echo `date`)
source activate RNA
start=$(date +%s.%N)
dir=$samples/cleandata
find $samples -name *1?f*q?gz|sort >$dir/1
find $samples -name *2?f*q?gz|sort >$dir/2
paste -d ":" $dir/1 $dir/2 > $dir/config && rm $dir/1 $dir/2
file_num=`ls -l $dir|wc -l`
if [ $file_num -lt 3 ];then
for id in `cat $dir/config`;do
read -u$1
fq1=$(echo $id|cut -d":" -f1)
fq2=$(echo $id |cut -d":" -f2)
base_name=$(basename $fq1)
name=`awk -v each=$base_name 'BEGIN{split(each,arr,"_");print arr[1]}' `
{
trim_galore -q 25 --phred33 --length 25 --stringency 3 --paired -o $dir $fq1 $fq2 &> $dir/trim.log
printf "[%s %s %s %s %s %s]::%s质量控制完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::数据质量控制耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
settings
thread 6 qc $samples
thread 3 trim_qc
thread 6 qc $samples/cleandata
Tophat –> Cufflink –> Cuffdiff
流程图:
手动安装相关软件
我们已经使用bioconda安装相关的软件,现在手动安装一下,本流程所需要的软件
下载并安装比对软件bowtie2
cd ~/local/app
curl -OL http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.4/bowtie2-2.2.4-linux-x86_64.zip
unzip bowtie2-2.2.4-linux-x86_64.zip
把比对软件以及相关程序链接到bin文件夹
ln -s ~/local/app/bowtie2-2.2.4/bowtie2 ~/bin/
ln -s ~/local/app/bowtie2-2.2.4/bowtie2-align* ~/bin/
ln -s ~/local/app/bowtie2-2.2.4/bowtie2-build ~/bin/
安装tophat2
cd ~/local/app/
curl -OL http://ccb.jhu.edu/software/tophat/downloads/tophat-2.1.1.Linux_x86_64.tar.gz
tar zxvf tophat-2.0.13.Linux_x86_64.tar.gz
cd ~/bin/
vi tophat
#!/usr/bin/env bash
python2 ~/local/app/tophat-2.1.1.Linux_x86_64/tophat $@
chmod 755 tophat
保存退出
#注释文件
cd /data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genes
curl -L ftp://ftp.ensembl.org/pub/release-77/gtf/homo_sapiens/Homo_sapiens.GRCh38.77.gtf.gz > hg38.gtf.gz
gunzip *.gz
cat hg38.gtf | awk ' $1 =="X" { print $0 }' > chr_X.gtf
安装cufflinks
cd ~/local/app
curl -OL http://cole-trapnell-lab.github.io/cufflinks/assets/downloads/cufflinks-2.1.1.Linux_x86_64.tar.gz
tar zxvf cufflinks-2.1.1.Linux_x86_64.tar.gz
ln -fs ~/local/app/cufflinks-2.1.1.Linux_x86_64/cufflinks ~/bin
ln -fs ~/local/app/cufflinks-2.1.1.Linux_x86_64/cuffdiff ~/bin
ln -fs ~/local/app/cufflinks-2.1.1.Linux_x86_64/gtf_to_sam ~/bin
ln -fs ~/local/app/cufflinks-2.1.1.Linux_x86_64/cuffcompare ~/bin
cd ~/bin
vi cuffmerge
#!/usr/bin/env bash
python2 ~/local/app/cufflinks-2.1.1.Linux_x86_64/cuffmerge $@
chmod 755 cuffmerge
流程代码
#!/usr/bin/env bash
set -ue
settings(){
samples=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples
index=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/index
output=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/tophat+cuff
if test -w $(dirname $output) && test -w $(dirname index);then
mkdir -p {$index/bowtie,$output/1_tophat,$output/2_cufflinks,$output/3_cuffdiff}
fi
cuffdiff=$output/3_cuffdiff
indexes=$index/bowtie/chrX
genome=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/chrX.fa
gene=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genes/chrX.gtf
}
thread(){
tmp_fifofile="/tmp/$$.fifo"
mkfifo "$tmp_fifofile"
exec 6<>"$tmp_fifofile"
rm $tmp_fifofile
thread_num=$1
for((i=0;i<$thread_num;i++));do
echo
done >&6
$2 6
exec 6>&-
}
index(){
printf "[%s %s %s %s %s %s]::建立索引bowtie2-build\n" $(echo `date`)
start=$(date +%s.%N)
file_num=`ls -l $index/bowtie|wc -l`
source activate RNA
base_name=$(basename $genome)
name=`awk -v each=$base_name 'BEGIN{split(each,arr,".");print arr[1]}' `
if [ $file_num -lt 2 ];then
bowtie2-build -f $genome $index/bowtie/$name &> $index/bowtie/index.log
ln -s $genome $index/bowtie/$basename
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::建立索引耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
mapping(){
printf "[%s %s %s %s %s %s]::与参考基因组比对tophat\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/1_tophat
find $samples/cleandata -name *1?f*q.gz|sort > $dir/1
find $samples/cleandata -name *2?f*q.gz|sort > $dir/2
paste -d ":" $dir/1 $dir/2 > $dir/config && rm $dir/1 $dir/2
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $dir/config);do
fq1=$(echo $id|cut -d":" -f1)
fq2=$(echo $id |cut -d":" -f2)
name=`awk -v each=$(basename $fq1) 'BEGIN{split(each,arr,"_");print arr[1]}' `
read -u$1
{
tophat -p 8 -G $gene -o $dir/$name $indexes $fq1 $fq2 &>> $dir/mapping.log
printf "[%s %s %s %s %s %s]::%s比对完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::比对耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
assemble(){
printf "[%s %s %s %s %s %s]::转录本组装和定量cufflinks\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_cufflinks
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $output/1_tophat/config);do
fq1=$(echo $id|cut -d":" -f1)
name=`awk -v each=$(basename $fq1) 'BEGIN{split(each,arr,"_");print arr[1]}' `
read -u$1
{
cufflinks -p 8 -g $gene -o $dir/$name $output/1_tophat/$name/accepted_hits.bam &> $dir/$name.log
printf "[%s %s %s %s %s %s]::%s转录本组装完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::转录本组装耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
merge(){
printf "[%s %s %s %s %s %s]::转录本合并cuffmerge\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_cufflinks
find $dir -name *transcripts?gtf|sort > $dir/assemblies.txt
source activate RNA
if [ ! -d $dir/merged_asm ];then
cuffmerge -p 8 -o $dir/merged_asm -g $gene -s $genome $dir/assemblies.txt &> $dir/cuffmerge.log
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::转录本合并耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
diff(){
printf "[%s %s %s %s %s %s]::差异分析cuffdiff\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/3_cuffdiff
S1=$output/1_tophat/ERR188245/accepted_hits.bam;S2=$output/1_tophat/ERR188428/accepted_hits.bam;S3=$output/1_tophat/ERR188337/accepted_hits.bam
S4=$output/1_tophat/ERR204916/accepted_hits.bam;S5=$output/1_tophat/ERR188234/accepted_hits.bam;S6=$output/1_tophat/ERR188273/accepted_hits.bam
S7=$output/1_tophat/ERR188401/accepted_hits.bam;S8=$output/1_tophat/ERR188257/accepted_hits.bam;S9=$output/1_tophat/ERR188383/accepted_hits.bam
S10=$output/1_tophat/ERR188454/accepted_hits.bam;S11=$output/1_tophat/ERR188104/accepted_hits.bam;S12=$output/1_tophat/ERR188044/accepted_hits.bam
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
cuffdiff -p 8 -b $genome -o $dir -L Female,Male -u $output/2_cufflinks/merged_asm/merged.gtf $S1,$S2,$S3,$S4,$S5,$S6 $S7,$S8,$S9,$S10,$S11,$S12 &> $dir/cuffdiff.log
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::差异分析cuffdiff耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
expression_matrix(){
dir=$output/3_cuffdiff
expr=$dir/gene_exp.diff
##筛选出下调的基因(log2_fold_change < -2 & pvalue < 0.001)
awk '{if(($10<-2)&&($11<0.001))print $3"\t"$8"\t"$9"\t"$10}' $dir/gene_exp.diff | grep -v 'inf' > $dir/down.txt
## 筛选出上调的基因(log2_fold_change > 2 & pvalue < 0.001
awk '{if(($10>2)&&($11<0.001))print $3"\t"$8"\t"$9"\t"$10}' $dir/gene_exp.diff | grep -v 'inf' > $dir/up.txt
}
settings
index
thread 4 mapping
thread 4 assemble
merge
diff
Subread -> featureCounts -> DESeq2
流程代码
#!/usr/bin/env bash
set -ue
settings(){
samples=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples
index=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/index
output=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/subread+featurecounts
if test -w $(dirname $output) && test -w $(dirname index);then
mkdir -p {$index/subread,$output/1_subjunc,$output/2_featurecounts}
else
echo "没有写入权限"
fi
genome=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/chrX.fa
gene=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genes/chr_X.gtf
}
thread(){
tmp_fifofile="/tmp/$$.fifo"
mkfifo "$tmp_fifofile"
exec 6<>"$tmp_fifofile"
rm $tmp_fifofile
thread_num=$1
for((i=0;i<$thread_num;i++));do
echo
done >&6
$2 6
exec 6>&-
}
index(){
printf "[%s %s %s %s %s %s]::建立索引subread-buildindex\n" $(echo `date`)
start=$(date +%s.%N)
file_num=`ls -l $index/subread|wc -l`
source activate RNA
base_name=$(basename $genome)
name=`awk -v each=$base_name 'BEGIN{split(each,arr,".");print arr[1]}' `
if [ $file_num -lt 2 ];then
subread-buildindex -o $index/subread/$name $genome &> $index/subread/index.log
fi
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::建立索引耗时%.2f分钟\n" $(echo `date`) $dur
source deactivate RNA
}
mapping(){
printf "[%s %s %s %s %s %s]::与参考基因组比对subjunc\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/1_subjunc
find $samples/cleandata -name *1?f*q.gz|sort > $dir/1
find $samples/cleandata -name *2?f*q.gz|sort > $dir/2
paste -d ":" $dir/1 $dir/2 > $dir/config && rm $dir/1 $dir/2
file_num=`ls -l $dir|wc -l`
index_prefix=`awk -v each=$(basename $genome) 'BEGIN{split(each,arr,".");print arr[1]}' `
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $dir/config);do
fq1=$(echo $id|cut -d":" -f1)
fq2=$(echo $id |cut -d":" -f2)
name=`awk -v each=$(basename $fq1) 'BEGIN{split(each,arr,"_");print arr[1]}' `
read -u$1
{
subjunc -T 5 -i $index/subread/$index_prefix -r $fq1 -R $fq2 -o $dir/${name}.subjunc.bam &> $dir/${name}.log
printf "[%s %s %s %s %s %s]::%s比对完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::比对耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
count(){
printf "[%s %s %s %s %s %s]:: 转录本定量featureCounts\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_featurecounts
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
featureCounts -T 5 -p -t exon -g gene_id -a $gene -o $dir/all.id.txt $output/1_subjunc/*.bam 1>$dir/counts.log 2>&1
# -g 默认gene_id, 可以选择gene_name,后面可以用R将ID转换成gene symbol
# 这样得到的all.id.txt文件就是表达矩阵,但是,这个featureCounts有非常多的参数可以调整
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::转录本定量耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
settings
index
thread 3 mapping
count
DESeq2差异分析
差异分析需要的包:
- limma
- edgeR
- gplots
- org.Mm.eg.db
- RColorBrewer
- Glimma
- DESeq2
DESeq2差异分析的三个主要数据:
- 表达矩阵(原始的count数据)
- 分组矩阵(实验设计的分组情况)
- 差异比较矩阵(指定差异比较对象)
DESeq2差异分析的三个主要步骤:
- 构建
DESeqDataSet
(dds对象) DESeq()
函数进行差异分析results()
函数提取差异分析结果
构建
DESeqDataSet
对象有两种方式,一种是通过DESeqDataSet()
函数将SummarizedExperiment
->DESeqDataSet
,如果我们只有表达矩阵和样本表型数据,如何构建DESeqDataSet数据集,可以通过另一种方式,使用DESeqDataSetFromMatrix()
函数将表达矩阵和样本表型数据转化为DESeqDataSet
数据对象。
colData
里面的信息,就相当于我们自己准备的表型数据,建议是csv,tsv格式(comma-separated value;tab- separated value)文件,可以把样本表型的数据赋值给colData
对象
suppressMessages(library(edgeR))
suppressMessages(library(limma))
suppressMessages(library(Glimma))
suppressMessages(library(gplots))
suppressMessages(library(DESeq2))
suppressMessages(library(org.Hs.eg.db))
suppressMessages(library(RColorBrewer))
读取数据,提取表达矩阵
#读取count table数据
data <- read.table("all.id.txt",stringsAsFactors = F, header = T)
meta = data[,1:6]
exprSet = data[,7:ncol(data)]
rownames(exprSet) <- data[,1]
colnames(exprSet)
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188044.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188104.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188234.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188245.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188257.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188273.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188337.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188383.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188401.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188428.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188454.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR204916.subjunc.bam'
class(exprSet)
‘data.frame’
修改列名
# suppressMessages(library(dplyr))
# name <- lapply(strsplit(as.character(colnames(exprSet)),'[.]'),function(x){x[length(x)-2]}) %>% unlist
name <- unlist(lapply(strsplit(as.character(colnames(exprSet)),'[.]'),function(x){x[length(x)-2]}))
colnames(exprSet) <- name
colnames(exprSet)
- 'ERR188044'
- 'ERR188104'
- 'ERR188234'
- 'ERR188245'
- 'ERR188257'
- 'ERR188273'
- 'ERR188337'
- 'ERR188383'
- 'ERR188401'
- 'ERR188428'
- 'ERR188454'
- 'ERR204916'
head(exprSet,6)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
ENSG00000182378 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
ENSG00000178605 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
ENSG00000226179 | 13 | 13 | 9 | 4 | 1 | 2 | 6 | 3 | 8 | 4 | 7 | 10 |
ENSG00000167393 | 243 | 173 | 236 | 282 | 171 | 76 | 279 | 60 | 327 | 111 | 96 | 258 |
ENSG00000275287 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
exprSet[rownames(exprSet)=="ENSG00000205755",]
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000205755 | 9 | 10 | 5 | 3 | 4 | 2 | 8 | 4 | 6 | 3 | 2 | 5 |
查看数据维度,即几行几列
dim(exprSet)
- 2450
- 12
表型数据读取,样本分组信息
# Read the sample information into R
coldata <- read.csv("geuvadis_phenodata.csv")
coldata
ids | sex | population |
---|---|---|
ERR188044 | male | YRI |
ERR188104 | male | YRI |
ERR188234 | female | YRI |
ERR188245 | female | GBR |
ERR188257 | male | GBR |
ERR188273 | female | YRI |
ERR188337 | female | GBR |
ERR188383 | male | GBR |
ERR188401 | male | GBR |
ERR188428 | female | GBR |
ERR188454 | male | YRI |
ERR204916 | female | YRI |
coldata$group <- factor(paste0(coldata$sex, coldata$population))
coldata
ids | sex | population | group |
---|---|---|---|
ERR188044 | male | YRI | maleYRI |
ERR188104 | male | YRI | maleYRI |
ERR188234 | female | YRI | femaleYRI |
ERR188245 | female | GBR | femaleGBR |
ERR188257 | male | GBR | maleGBR |
ERR188273 | female | YRI | femaleYRI |
ERR188337 | female | GBR | femaleGBR |
ERR188383 | male | GBR | maleGBR |
ERR188401 | male | GBR | maleGBR |
ERR188428 | female | GBR | femaleGBR |
ERR188454 | male | YRI | maleYRI |
ERR204916 | female | YRI | femaleYRI |
suppressMessages(library(pheatmap))
suppressMessages(library(corrplot))
可视化样本间的相似性
options(repr.plot.width = 5, repr.plot.height = 5)
# png('cor.png')
corrplot(cor(log2(exprSet+1)))
# dev.off()
m=cor(log2(exprSet+1))
#cor矩阵用scale归一化
pheatmap(scale(cor(log2(exprSet+1))))
构建DESeqDataSet(dds)对象
#将读取数据由data.frame转换成matrix类型
countdata <- as.matrix(exprSet)
#将countdata转换成DESeq2的数据格式(构建dds)
dds <- DESeqDataSetFromMatrix(countData = countdata,colData = coldata, design = ~ group )
dds
class: DESeqDataSet
dim: 2450 12
metadata(1): version
assays(1): counts
rownames(2450): ENSG00000228572 ENSG00000182378 ... ENSG00000276543
ENSG00000227159
rowData names(0):
colnames(12): ERR188044 ERR188104 ... ERR188454 ERR204916
colData names(4): ids sex population group
#将所有样本中count值均为0的基因所在行去除
dds <- dds[rowSums(counts(dds) > 0) != 0 ,] #dds[rowSums(assay(dds) > 0) != 0 , ]
dds
class: DESeqDataSet
dim: 1412 12
metadata(1): version
assays(1): counts
rownames(1412): ENSG00000228572 ENSG00000182378 ... ENSG00000182484
ENSG00000227159
rowData names(0):
colnames(12): ERR188044 ERR188104 ... ERR188454 ERR204916
colData names(4): ids sex population group
# DESeq分析,大小因子的估计,离差估计,负二项分布的拟合以及计算相应的统计量
dds <- DESeq(dds, parallel = T) #parallel = T程序并行,提高速度
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 1 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 1 workers
plotDispEsts(dds, main="Dispersion plot")
resultsNames(dds)
- 'Intercept'
- 'group_femaleYRI_vs_femaleGBR'
- 'group_maleGBR_vs_femaleGBR'
- 'group_maleYRI_vs_femaleGBR'
使用rlog-transformed归一化数据
rld <- rlog(dds)
countdata_rlog <- assay(rld)
head(assay(rld))
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | -1.716339 | -1.743507 | -1.743821 | -1.737338 | -1.739732 | -1.731162 | -1.716953 | -1.740015 | -1.744537 | -1.736857 | -1.713780 | -1.740743 |
ENSG00000182378 | 11.461408 | 10.989926 | 11.138604 | 10.963768 | 10.327285 | 11.675548 | 11.168207 | 10.169948 | 10.611627 | 10.418765 | 10.644815 | 11.174434 |
ENSG00000178605 | 10.007361 | 9.640540 | 10.095996 | 10.269333 | 10.421237 | 9.647242 | 10.331475 | 10.256303 | 10.378303 | 9.782669 | 10.204764 | 10.367591 |
ENSG00000226179 | 2.753014 | 2.747507 | 2.620197 | 2.538756 | 2.352026 | 2.503809 | 2.517179 | 2.450266 | 2.573627 | 2.546540 | 2.612662 | 2.723157 |
ENSG00000167393 | 7.540045 | 7.216756 | 7.476484 | 8.109476 | 7.458899 | 7.303154 | 7.627266 | 6.576983 | 7.734151 | 7.262772 | 6.894270 | 7.775687 |
ENSG00000275287 | -2.216337 | -2.207251 | -2.216517 | -2.214314 | -2.215128 | -2.212215 | -2.216546 | -2.215224 | -2.216760 | -2.214150 | -2.215468 | -2.215471 |
使用rlog-transformed 与 正常 区别
options(repr.plot.width = 9, repr.plot.height = 9)
par(cex = 0.7)
n.sample=ncol(countdata)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
par(mfrow=c(2,2))
boxplot(countdata, col = cols,main="expression value",las=2)
boxplot(countdata_rlog, col = cols,main="expression value",las=2)
hist(countdata)
hist(countdata_rlog)
热图呈现样品间的距离
mycols <- brewer.pal(8, "Dark2")[1:length(unique(coldata$group))]
sampleDists <- as.matrix(dist(t(countdata_rlog)))
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[coldata$group], RowSideColors=mycols[coldata$group],
margin=c(10, 10), main="Sample Distance Matrix")
差异分析及结果提取
#提取性别间差异分析的结果
sex_Y_result <- results(dds, contrast = c("group","maleYRI","femaleYRI"), parallel = T)
#?results 查看用法
sex_G_result <- results(dds, contrast = c("group","maleGBR","femaleGBR"), parallel = T)
sex_G_result
log2 fold change (MLE): group maleGBR vs femaleGBR
Wald test p-value: group maleGBR vs femaleGBR
DataFrame with 1412 rows and 6 columns
baseMean log2FoldChange lfcSE
<numeric> <numeric> <numeric>
ENSG00000228572 0.212064658186015 -1.18134449838418 4.40721523394822
ENSG00000182378 2080.8358537599 -0.759383223494156 0.369671551206202
ENSG00000178605 1146.43865301777 0.288751709902595 0.319524288527945
ENSG00000226179 6.08058463520939 -0.493246072149153 0.820198203867892
ENSG00000167393 183.376192726023 -0.562850597829534 0.484234566315877
... ... ... ...
ENSG00000124334 60.2368766837945 0.489304618353469 0.581968571149684
ENSG00000270726 0.209832834505338 -0.22070381980218 4.40735632395141
ENSG00000185203 1.03705472979752 1.36980197787972 2.33437609518548
ENSG00000182484 493.281659491792 0.264865107532117 0.359069049586416
ENSG00000227159 3.18504610277688 -0.645544611222065 1.25294052192237
stat pvalue padj
<numeric> <numeric> <numeric>
ENSG00000228572 -0.268047834216135 0.788662499091892 0.992519295917494
ENSG00000182378 -2.05421061214032 0.0399553117393148 0.992519295917494
ENSG00000178605 0.903692521256771 0.366158465847581 0.992519295917494
ENSG00000226179 -0.601374240790949 0.547590751585147 0.992519295917494
ENSG00000167393 -1.16235113513638 0.245092863468186 0.992519295917494
... ... ... ...
ENSG00000124334 0.840774987877512 0.400474001859852 0.992519295917494
ENSG00000270726 -0.0500762369955846 0.960061636105264 0.992519295917494
ENSG00000185203 0.58679575270877 0.557340889682515 0.992519295917494
ENSG00000182484 0.737643937390858 0.460730848276013 0.992519295917494
ENSG00000227159 -0.515223667785613 0.606396732014109 0.992519295917494
结果说明
- 第一列基因名
- baseMean 经过矫正的reads count均值
- log2FoldChange 对差异倍数取以2为底的对数
- lfcSE 差异倍数取对数后的标准差
- stat Wald统计量,由log2FoldChange除以标准差所得
- pvalue 原始的p值
- padj 校正后的p值
summary(sex_G_result)
out of 1412 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 2, 0.14%
LFC < 0 (down) : 13, 0.92%
outliers [1] : 1, 0.071%
low counts [2] : 0, 0%
(mean count < 0)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
#在群体间差异分析的基因分析结果
population_F_result <- results(dds, contrast = c("group","femaleYRI","femaleGBR"), parallel = T)
population_M_result <- results(dds, contrast = c("group","maleYRI","maleGBR"), parallel = T)
#提取性别间差异表达的基因,并存入文件
sex_Y_result_0.01 <- sex_Y_result[which(sex_Y_result$pvalue < 0.01),]
write.csv(sex_Y_result_0.01, file="DE/sex_Y_result_0.01_DESeq2.csv")
#提取种群间差异表达的基因,并存入文件
population_F_result_0.01 <- subset(population_F_result,pvalue < 0.01)
write.csv(population_F_result_0.01, file="DE/population_F_result_0.01_DESeq2.csv")
流程参考文献RNA-Seq workflow: gene-level exploratory analysis and differential expression
edgeR差异分析
edgeR差异分析的三个主要数据:
- 表达矩阵(原始的count数据)
- 分组矩阵(实验设计的分组情况)
- 差异比较矩阵(指定差异比较对象)
edgeR差异分析的五个主要步骤:
- DGElist()函数构造DGEList对象,转化counts 成DGEList对象
- model.matrix()函数构建设计矩阵
- estimateDisp()函数进行差异分析
- glmQLFit()函数构建DGEGLM对象
- glmQLFTest()函数进行QL F-test
读取数据,提取表达矩阵
上面已经做过了
表型数据读取,样本分组信息
上面已经做过了
head(countdata,3)
dim(countdata)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
ENSG00000182378 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
ENSG00000178605 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
- 2450
- 12
# rm(list=ls())
ls('package:org.Hs.eg.db')
- 'org.Hs.eg'
- 'org.Hs.eg_dbconn'
- 'org.Hs.eg_dbfile'
- 'org.Hs.eg_dbInfo'
- 'org.Hs.eg_dbschema'
- 'org.Hs.eg.db'
- 'org.Hs.egACCNUM'
- 'org.Hs.egACCNUM2EG'
- 'org.Hs.egALIAS2EG'
- 'org.Hs.egCHR'
- 'org.Hs.egCHRLENGTHS'
- 'org.Hs.egCHRLOC'
- 'org.Hs.egCHRLOCEND'
- 'org.Hs.egENSEMBL'
- 'org.Hs.egENSEMBL2EG'
- 'org.Hs.egENSEMBLPROT'
- 'org.Hs.egENSEMBLPROT2EG'
- 'org.Hs.egENSEMBLTRANS'
- 'org.Hs.egENSEMBLTRANS2EG'
- 'org.Hs.egENZYME'
- 'org.Hs.egENZYME2EG'
- 'org.Hs.egGENENAME'
- 'org.Hs.egGO'
- 'org.Hs.egGO2ALLEGS'
- 'org.Hs.egGO2EG'
- 'org.Hs.egMAP'
- 'org.Hs.egMAP2EG'
- 'org.Hs.egMAPCOUNTS'
- 'org.Hs.egOMIM'
- 'org.Hs.egOMIM2EG'
- 'org.Hs.egORGANISM'
- 'org.Hs.egPATH'
- 'org.Hs.egPATH2EG'
- 'org.Hs.egPFAM'
- 'org.Hs.egPMID'
- 'org.Hs.egPMID2EG'
- 'org.Hs.egPROSITE'
- 'org.Hs.egREFSEQ'
- 'org.Hs.egREFSEQ2EG'
- 'org.Hs.egSYMBOL'
- 'org.Hs.egSYMBOL2EG'
- 'org.Hs.egUCSCKG'
- 'org.Hs.egUNIGENE'
- 'org.Hs.egUNIGENE2EG'
- 'org.Hs.egUNIPROT'
entrezID2ensemblID <- toTable(org.Hs.egENSEMBL)
head(entrezID2ensemblID)
gene_id | ensembl_id |
---|---|
1 | ENSG00000121410 |
2 | ENSG00000175899 |
3 | ENSG00000256069 |
9 | ENSG00000171428 |
10 | ENSG00000156006 |
12 | ENSG00000196136 |
entrezID2symbol <- toTable(org.Hs.egSYMBOL)
head(entrezID2symbol)
gene_id | symbol |
---|---|
1 | A1BG |
2 | A2M |
3 | A2MP1 |
9 | NAT1 |
10 | NAT2 |
11 | NATP |
columns(org.Hs.eg.db)
- 'ACCNUM'
- 'ALIAS'
- 'ENSEMBL'
- 'ENSEMBLPROT'
- 'ENSEMBLTRANS'
- 'ENTREZID'
- 'ENZYME'
- 'EVIDENCE'
- 'EVIDENCEALL'
- 'GENENAME'
- 'GO'
- 'GOALL'
- 'IPI'
- 'MAP'
- 'OMIM'
- 'ONTOLOGY'
- 'ONTOLOGYALL'
- 'PATH'
- 'PFAM'
- 'PMID'
- 'PROSITE'
- 'REFSEQ'
- 'SYMBOL'
- 'UCSCKG'
- 'UNIGENE'
- 'UNIPROT'
help('select')
ann <- select(org.Hs.eg.db,keys = rownames(countdata), keytype="ENSEMBL",columns = c("ENSEMBL","SYMBOL","GENENAME"))
'select()' returned 1:many mapping between keys and columns
head(ann)
dim(ann)
ENSEMBL | SYMBOL | GENENAME |
---|---|---|
ENSG00000228572 | NA | NA |
ENSG00000182378 | PLCXD1 | phosphatidylinositol specific phospholipase C X domain containing 1 |
ENSG00000178605 | GTPBP6 | GTP binding protein 6 (putative) |
ENSG00000226179 | NA | NA |
ENSG00000167393 | PPP2R3B | protein phosphatase 2 regulatory subunit B''beta |
ENSG00000275287 | NA | NA |
- 2479
- 3
coldata
ids | sex | population | group |
---|---|---|---|
ERR188044 | male | YRI | maleYRI |
ERR188104 | male | YRI | maleYRI |
ERR188234 | female | YRI | femaleYRI |
ERR188245 | female | GBR | femaleGBR |
ERR188257 | male | GBR | maleGBR |
ERR188273 | female | YRI | femaleYRI |
ERR188337 | female | GBR | femaleGBR |
ERR188383 | male | GBR | maleGBR |
ERR188401 | male | GBR | maleGBR |
ERR188428 | female | GBR | femaleGBR |
ERR188454 | male | YRI | maleYRI |
ERR204916 | female | YRI | femaleYRI |
过滤表达量低的基因
在这个数据集中,我们选择保留至少在两个样本中的CPM(counts-per-million)值大于0.5的基因(根据实际情况作调整)
使用edgeR
包中的cpm
函数
myCPM <- cpm(countdata)
# Have a look at the output
head(myCPM)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | 0.8414109 | 0.0000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.9116842 | 0.000000 | 0.0000 | 0.00000 | 1.058151 | 0.00000 |
ENSG00000182378 | 3473.3441034 | 2263.3743096 | 2255.726656 | 2125.22542 | 1185.821950 | 3899.517816 | 2963.8854523 | 976.066093 | 1629.1449 | 1167.61147 | 1605.214567 | 2623.31454 |
ENSG00000178605 | 1039.1424340 | 730.5070523 | 1010.692058 | 1348.62557 | 1692.180876 | 649.603988 | 1633.7381515 | 1404.362993 | 1648.4198 | 788.82113 | 1320.572036 | 1489.62082 |
ENSG00000226179 | 10.9383414 | 11.1201308 | 6.469579 | 5.21208 | 1.177579 | 3.787778 | 5.4701054 | 3.381291 | 6.7043 | 5.20674 | 7.407055 | 10.14037 |
ENSG00000167393 | 204.4628433 | 147.9832787 | 169.646747 | 367.45161 | 201.365992 | 143.935577 | 254.3599019 | 67.625826 | 274.0383 | 144.48704 | 101.582464 | 261.62163 |
ENSG00000275287 | 0.0000000 | 0.8553947 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0000000 | 0.000000 | 0.0000 | 0.00000 | 0.000000 | 0.00000 |
# Which values in myCPM are greater than 0.5?
thresh <- myCPM > 0.5
# This produces a logical matrix with TRUEs and FALSEs
head(thresh)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | TRUE | FALSE | FALSE | FALSE | FALSE | FALSE | TRUE | FALSE | FALSE | FALSE | TRUE | FALSE |
ENSG00000182378 | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE |
ENSG00000178605 | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE |
ENSG00000226179 | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE |
ENSG00000167393 | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE |
ENSG00000275287 | FALSE | TRUE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE |
# Summary of how many TRUEs there are in each row
# There are 11433 genes that have TRUEs in all 12 samples.
table(rowSums(thresh))
0 1 2 3 4 5 6 7 8 9 10 11 12
1038 208 106 79 58 45 40 44 28 34 40 58 672
# we would like to keep genes that have at least 2 TRUES in each row of thresh
keep <- rowSums(thresh) >= 2
# Subset the rows of countdata to keep the more highly expressed genes
counts.keep <- countdata[keep,]
summary(keep)
Mode FALSE TRUE
logical 1246 1204
dim(counts.keep)
- 1204
- 12
构建DGEList对象(edgeR的数据分析格式)
edgeR_data <- DGEList(counts = counts.keep, group = coldata$group)
# See what slots are stored in y
names(edgeR_data)
head(edgeR_data$counts)
- 'counts'
- 'samples'
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
ENSG00000182378 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
ENSG00000178605 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
ENSG00000226179 | 13 | 13 | 9 | 4 | 1 | 2 | 6 | 3 | 8 | 4 | 7 | 10 |
ENSG00000167393 | 243 | 173 | 236 | 282 | 171 | 76 | 279 | 60 | 327 | 111 | 96 | 258 |
ENSG00000237531 | 0 | 1 | 4 | 0 | 0 | 0 | 4 | 0 | 0 | 6 | 2 | 0 |
rownames(edgeR_data$counts) <- ann[match(rownames(edgeR_data$counts),ann$ENSEMBL),2]
head(edgeR_data$counts)
# dim(edgeR_data$counts)
# a <- ann[ann$ENSEMBL %in% rownames(edgeR_data$counts) , ]
# head(a)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
NA | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
PLCXD1 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
GTPBP6 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
NA | 13 | 13 | 9 | 4 | 1 | 2 | 6 | 3 | 8 | 4 | 7 | 10 |
PPP2R3B | 243 | 173 | 236 | 282 | 171 | 76 | 279 | 60 | 327 | 111 | 96 | 258 |
NA | 0 | 1 | 4 | 0 | 0 | 0 | 4 | 0 | 0 | 6 | 2 | 0 |
edgeR_data$counts <- edgeR_data$counts[which(rownames(edgeR_data$counts) != 'NA'), ]
head(edgeR_data$counts)
dim(edgeR_data$counts)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
PLCXD1 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
GTPBP6 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
PPP2R3B | 243 | 173 | 236 | 282 | 171 | 76 | 279 | 60 | 327 | 111 | 96 | 258 |
CRLF2 | 9 | 10 | 5 | 3 | 4 | 2 | 8 | 4 | 6 | 3 | 2 | 5 |
CSF2RA | 7 | 2 | 3 | 1 | 1 | 2 | 1 | 6 | 2 | 0 | 0 | 7 |
IL3RA | 651 | 664 | 195 | 202 | 455 | 216 | 283 | 413 | 693 | 221 | 593 | 620 |
- 697
- 12
# Library size information is stored in the samples slot
edgeR_data$samples
group | lib.size | norm.factors | |
---|---|---|---|
ERR188044 | maleYRI | 1188446 | 1 |
ERR188104 | maleYRI | 1169031 | 1 |
ERR188234 | femaleYRI | 1391106 | 1 |
ERR188245 | femaleGBR | 767431 | 1 |
ERR188257 | maleGBR | 849187 | 1 |
ERR188273 | femaleYRI | 528000 | 1 |
ERR188337 | femaleGBR | 1096838 | 1 |
ERR188383 | maleGBR | 887227 | 1 |
ERR188401 | maleGBR | 1193246 | 1 |
ERR188428 | femaleGBR | 768206 | 1 |
ERR188454 | maleYRI | 945031 | 1 |
ERR204916 | femaleYRI | 986132 | 1 |
可以自己手动计算一下,看一致否
apply(counts.keep,2,sum)
- ERR188044
- 1188446
- ERR188104
- 1169031
- ERR188234
- 1391106
- ERR188245
- 767431
- ERR188257
- 849187
- ERR188273
- 528000
- ERR188337
- 1096838
- ERR188383
- 887227
- ERR188401
- 1193246
- ERR188428
- 768206
- ERR188454
- 945031
- ERR204916
- 986132
质控
文库大小及其分布图
edgeR_data$samples$lib.size
- 1188446
- 1169031
- 1391106
- 767431
- 849187
- 528000
- 1096838
- 887227
- 1193246
- 768206
- 945031
- 986132
# The names argument tells the barplot to use the sample names on the x-axis
# The las argument rotates the axis names
options(repr.plot.width = 6, repr.plot.height = 4)
barplot(edgeR_data$samples$lib.size,names=colnames(edgeR_data),las=2)
title("Barplot of library sizes")
# Get log2 counts per million
logcounts <- cpm(edgeR_data,log=TRUE)
# Check distributions of samples using boxplots
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2)
# Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(logcounts),col="blue")
title("Boxplots of logCPMs (unnormalised)")
Multidimensional scaling plots
A principle components analysis is an example of an unsupervised analysis, where we don’t need to specify the groups.
plotMDS(edgeR_data)
按组添加颜色
# We specify the option to let us plot two plots side-by-sde
par(mfrow=c(1,2))
options(repr.plot.width = 9, repr.plot.height = 5)
# Let's set up colour schemes for sex
levels(coldata$sex)
## Let's choose purple for basal and orange for luminal
col.sex <- c("purple","orange")[coldata$sex]
data.frame(coldata$sex,col.sex)
# Redo the MDS with cell type colouring
plotMDS(edgeR_data,col=col.sex)
# Let's add a legend to the plot so we know which colours correspond to which sex
legend("topleft",fill=c("purple","orange"),legend=levels(coldata$sex))
# Add a title
title("sex")
# Similarly for population
levels(coldata$population)
col.population <- c("blue","red")[coldata$population]
plotMDS(edgeR_data,col=col.population)
legend("topleft",fill=c("blue","red"),legend=levels(coldata$population),cex=0.8)
title("population")
- 'female'
- 'male'
coldata.sex | col.sex |
---|---|
male | orange |
male | orange |
female | purple |
female | purple |
male | orange |
female | purple |
female | purple |
male | orange |
male | orange |
female | purple |
male | orange |
female | purple |
- 'GBR'
- 'YRI'
整合成一张图
cols <- c("#1B9E77", "#D95F02")
col.population <- c("#1B9E77", "#D95F02")[coldata$population]
char.sex <- c(1,4)[coldata$sex]
char.sex
- 4
- 4
- 1
- 1
- 4
- 1
- 1
- 4
- 4
- 1
- 4
- 1
options(repr.plot.width = 6, repr.plot.height = 5)
plotMDS(edgeR_data,dim=c(1,2),col=col.population,pch=char.sex,cex=2)
legend("topright",legend=levels(coldata$population),col=cols,pch=16)
legend("top",legend=levels(coldata$sex),pch=c(1,4))
标准化,归一化
Normalisation for composition bias
# Apply normalisation to DGEList object
edgeR_data <- calcNormFactors(edgeR_data)
This will update the normalisation factors in the DGEList object (their default values are 1). Take a look at the normalisation factors for these samples.
edgeR_data$samples
group | lib.size | norm.factors | |
---|---|---|---|
ERR188044 | maleYRI | 1188446 | 1.0098209 |
ERR188104 | maleYRI | 1169031 | 1.0000360 |
ERR188234 | femaleYRI | 1391106 | 0.9027058 |
ERR188245 | femaleGBR | 767431 | 0.9373145 |
ERR188257 | maleGBR | 849187 | 1.0644447 |
ERR188273 | femaleYRI | 528000 | 0.9306171 |
ERR188337 | femaleGBR | 1096838 | 1.0971591 |
ERR188383 | maleGBR | 887227 | 1.0314882 |
ERR188401 | maleGBR | 1193246 | 1.0766819 |
ERR188428 | femaleGBR | 768206 | 0.9556656 |
ERR188454 | maleYRI | 945031 | 1.0344366 |
ERR204916 | femaleYRI | 986132 | 0.9808063 |
ERR188234样本的normalisation factors
最小,ERR188337样本的normalisation factors
最大,
If we plot mean difference plots using the plotMD function for these samples, we should be able to see the composition bias problem. We will use the logcounts, which have been normalised for library size, but not for composition bias.
options(repr.plot.width = 9, repr.plot.height = 5)
par(mfrow=c(1,2))
plotMD(logcounts,column = 3)
abline(h=0,col="grey")
plotMD(logcounts,column = 7)
abline(h=0,col="grey")
The mean-difference plots show average expression (mean: x-axis) against log-fold-changes (difference: y-axis). Because our DGEList object contains the normalisation factors, if we redo these plots using edgeR_data
, we should see the composition bias problem has been solved.
par(mfrow=c(1,2))
plotMD(edgeR_data,column = 3)
abline(h=0,col="grey")
plotMD(edgeR_data,column = 7)
abline(h=0,col="grey")
建立设计矩阵
design <- model.matrix(~0 + group , data = coldata)
design
groupfemaleGBR | groupfemaleYRI | groupmaleGBR | groupmaleYRI | |
---|---|---|---|---|
1 | 0 | 0 | 0 | 1 |
2 | 0 | 0 | 0 | 1 |
3 | 0 | 1 | 0 | 0 |
4 | 1 | 0 | 0 | 0 |
5 | 0 | 0 | 1 | 0 |
6 | 0 | 1 | 0 | 0 |
7 | 1 | 0 | 0 | 0 |
8 | 0 | 0 | 1 | 0 |
9 | 0 | 0 | 1 | 0 |
10 | 1 | 0 | 0 | 0 |
11 | 0 | 0 | 0 | 1 |
12 | 0 | 1 | 0 | 0 |
#修改设计矩阵的列名,行名
colnames(design)
- 'groupfemaleGBR'
- 'groupfemaleYRI'
- 'groupmaleGBR'
- 'groupmaleYRI'
levels(coldata$group)
- 'femaleGBR'
- 'femaleYRI'
- 'maleGBR'
- 'maleYRI'
colnames(design) <- levels(coldata$group)
colnames(design)
rownames(design) <- coldata$id
rownames(design)
design
- 'femaleGBR'
- 'femaleYRI'
- 'maleGBR'
- 'maleYRI'
- 'ERR188044'
- 'ERR188104'
- 'ERR188234'
- 'ERR188245'
- 'ERR188257'
- 'ERR188273'
- 'ERR188337'
- 'ERR188383'
- 'ERR188401'
- 'ERR188428'
- 'ERR188454'
- 'ERR204916'
femaleGBR | femaleYRI | maleGBR | maleYRI | |
---|---|---|---|---|
ERR188044 | 0 | 0 | 0 | 1 |
ERR188104 | 0 | 0 | 0 | 1 |
ERR188234 | 0 | 1 | 0 | 0 |
ERR188245 | 1 | 0 | 0 | 0 |
ERR188257 | 0 | 0 | 1 | 0 |
ERR188273 | 0 | 1 | 0 | 0 |
ERR188337 | 1 | 0 | 0 | 0 |
ERR188383 | 0 | 0 | 1 | 0 |
ERR188401 | 0 | 0 | 1 | 0 |
ERR188428 | 1 | 0 | 0 | 0 |
ERR188454 | 0 | 0 | 0 | 1 |
ERR204916 | 0 | 1 | 0 | 0 |
估计离散度
edgeR_data <- estimateDisp(edgeR_data, design)
names(edgeR_data)
- 'counts'
- 'samples'
- 'design'
- 'common.dispersion'
- 'trended.dispersion'
- 'tagwise.dispersion'
- 'AveLogCPM'
- 'trend.method'
- 'prior.df'
- 'prior.n'
- 'span'
构建DGEGLM对象
fit <- glmQLFit(edgeR_data, design)
建立对照
?makeContrasts
my.contrasts <- makeContrasts(YRI.f.vs.m = femaleYRI - maleYRI, GBR.f.vs.m = femaleGBR - maleGBR, f.YRI.vs.GBR = femaleYRI - femaleGBR, m.YRI.vs.GBR = maleYRI - maleGBR, levels = design)
差异分析(用quasi-likelihood(QL) F-test)
qlf_YRI.f.vs.m <- glmQLFTest(fit, contrast = my.contrasts[, "YRI.f.vs.m"])
qlf_GBR.f.vs.m <- glmQLFTest(fit, contrast = my.contrasts[, "GBR.f.vs.m"])
qlf_f.YRI.vs.GBR <- glmQLFTest(fit, contrast = my.contrasts[, "f.YRI.vs.GBR"])
qlf_m.YRI.vs.GBR <- glmQLFTest(fit, contrast = my.contrasts[, "m.YRI.vs.GBR"])
names(qlf_m.YRI.vs.GBR)
- 'coefficients'
- 'fitted.values'
- 'deviance'
- 'method'
- 'unshrunk.coefficients'
- 'df.residual'
- 'design'
- 'offset'
- 'dispersion'
- 'prior.count'
- 'AveLogCPM'
- 'df.residual.zeros'
- 'df.prior'
- 'var.post'
- 'var.prior'
- 'samples'
- 'table'
- 'comparison'
- 'df.test'
- 'df.total'
对每个差异分析分析结果的pvalue进行校正
YRI.f.vs.m_padjust <- p.adjust(qlf_YRI.f.vs.m$table$PValue, method = "BH")
GBR.f.vs.m_padjust <- p.adjust(qlf_GBR.f.vs.m$table$PValue, method = "BH")
f.YRI.vs.GBR_padjust <- p.adjust(qlf_f.YRI.vs.GBR$table$PValue, method = "BH")
m.YRI.vs.GBR_padjust <- p.adjust(qlf_m.YRI.vs.GBR$table$PValue, method = "BH")
提取差异表达的基因
YRI.f.vs.m.sig <- qlf_YRI.f.vs.m$table[which(YRI.f.vs.m_padjust < 0.05),]
write.csv(YRI.f.vs.m.sig, file = "DE/YRI.f.vs.m_0.05_edgeR.csv")
GBR.f.vs.m.sig <- qlf_GBR.f.vs.m$table[which(qlf_GBR.f.vs.m$table$PValue < 0.5),]
write.csv(GBR.f.vs.m.sig, file = "DE/GBR.f.vs.m_0.05_edgeR.csv")
HISAT2 ->StringTie -> Ballgown
参考文献:Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown
流程示意图
流程代码
#!/usr/bin/env bash
set -ue
settings(){
samples=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples
index=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/index
output=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/hisat+stringtie
if test -w $(dirname $output) && test -w $(dirname index);then
mkdir -p {$index/hisat2,$output/1_hisat,$output/2_stringtie,$output/3_ballgown,$output/4_DE}
fi
hisat=$output/1_hisat
stringtie=$output/2_stringtie
ballgown=$output/3_ballgown
genome=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/chrX.fa
gene=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genes/chrX.gtf
}
thread(){
tmp_fifofile="/tmp/$$.fifo"
mkfifo "$tmp_fifofile"
exec 6<>"$tmp_fifofile"
rm $tmp_fifofile
thread_num=$1
for((i=0;i<$thread_num;i++));do
echo
done >&6
$2 6
exec 6>&-
}
index(){
printf "[%s %s %s %s %s %s]::建立索引hisat2-build\n" $(echo `date`)
start=$(date +%s.%N)
file_num=`ls -l $index/hisat2|wc -l`
source activate RNA
base_name=$(basename $genome)
name=`awk -v each=$base_name 'BEGIN{split(each,arr,".");print arr[1]}' `
if [ $file_num -lt 2 ];then
hisat2_extract_exons.py $gene >$index/hisat2/$name'.exon'
hisat2_extract_splice_sites.py $gene >$index/hisat2/$name'.ss'
hisat2-build --ss $index/hisat2/$name'.ss' --exon $index/hisat2/$name'.exon' $genome $index/hisat2/$name 1>$index/hisat2/index.log 2>&1
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::建立索引耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
mapping(){
printf "[%s %s %s %s %s %s]::与参考基因组比对hisat2\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/1_hisat
find $samples/cleandata -name *1?f*q.gz|sort > $dir/1
find $samples/cleandata -name *2?f*q.gz|sort > $dir/2
paste -d ":" $dir/1 $dir/2 > $dir/config && rm $dir/1 $dir/2
file_num=`ls -l $dir|wc -l`
index_prefix=`awk -v each=$(basename $genome) 'BEGIN{split(each,arr,".");print arr[1]}' `
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $dir/config);do
fq1=$(echo $id|cut -d":" -f1)
fq2=$(echo $id |cut -d":" -f2)
name=`awk -v each=$(basename $fq1) 'BEGIN{split(each,arr,"_");print arr[1]}' `
read -u$1
{
hisat2 -p 8 --dta -x $index/hisat2/${index_prefix} -1 $fq1 -2 $fq2 -S $dir/${name}.sam &> $dir/${name}.hisat.log
samtools sort -@ 8 -o $dir/${name}.bam $dir/${name}.sam &> /dev/null && rm $dir/${name}.sam
printf "[%s %s %s %s %s %s]::%s比对完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::比对耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
assemble(){
printf "[%s %s %s %s %s %s]::转录本组装stringtie\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_stringtie
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
for id in $output/1_hisat/*.bam;do
base_name=$(basename $id)
i=${base_name%.bam*}
read -u$1
{
stringtie -p 8 -G $gene -o $dir/${i}.gtf -l $i $output/1_hisat/${i}.bam &>> $dir/assemble.log
printf "[%s %s %s %s %s %s]::%s转录本组装完成\n" $(echo `date`) $i
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::转录本组装耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
merge(){
printf "[%s %s %s %s %s %s]::转录本合并stringtie --merge\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_stringtie
find $dir -name *?gtf|grep -v '.*merge.gtf'|sort > $dir/mergelist.txt
source activate RNA
if [ ! -f $dir/stringtie_merge.gtf ];then
stringtie --merge -p 8 -G $gene -o $dir/stringtie_merge.gtf $dir/mergelist.txt
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::转录本合并耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
count(){
printf "[%s %s %s %s %s %s]::转录本(基因)的定量:stringtie\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/3_ballgown
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $output/2_stringtie/mergelist.txt);do
base_name=$(basename $id)
name=${base_name%.gtf}
read -u$1
{
stringtie -B -p 8 -G $output/2_stringtie/stringtie_merge.gtf -o $dir/$name/$base_name $output/1_hisat/${name}.bam &>> $dir/count.log
printf "[%s %s %s %s %s %s]::%s转录本定量完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::转录本合并耗时%.2f分钟\n" $(echo `date`) $dur
fi
source deactivate RNA
}
settings
index
thread 4 mapping
thread 6 assemble
merge
thread 6 count
差异表达分析ballgown
设置工作目录
就设置为当前目录下,新建一个DE目录,根据需求调整
!pwd
!mkdir DE
/root/python_code_scq/python_bio_file
mkdir: 无法创建目录"DE": 文件已存在
setwd("/root/python_code_scq/python_bio_file/DE")
suppressMessages(library(ballgown))
suppressMessages(library(genefilter))
suppressMessages(library(dplyr))
读取样本的表型数据
pheno_data=read.csv("/root/python_code_scq/python_bio_file/geuvadis_phenodata.csv")
pheno_data
ids | sex | population |
---|---|---|
ERR188044 | male | YRI |
ERR188104 | male | YRI |
ERR188234 | female | YRI |
ERR188245 | female | GBR |
ERR188257 | male | GBR |
ERR188273 | female | YRI |
ERR188337 | female | GBR |
ERR188383 | male | GBR |
ERR188401 | male | GBR |
ERR188428 | female | GBR |
ERR188454 | male | YRI |
ERR204916 | female | YRI |
读取定量分析的表达数据
bg_chrX = ballgown(dataDir = "../3_ballgown",samplePattern = "ERR", pData = pheno_data )
Sat Dec 29 10:56:05 2018
Sat Dec 29 10:56:05 2018: Reading linking tables
Sat Dec 29 10:56:06 2018: Reading intron data files
Sat Dec 29 10:56:07 2018: Merging intron data
Sat Dec 29 10:56:07 2018: Reading exon data files
Sat Dec 29 10:56:09 2018: Merging exon data
Sat Dec 29 10:56:10 2018: Reading transcript data files
Sat Dec 29 10:56:10 2018: Merging transcript data
Wrapping up the results
Sat Dec 29 10:56:10 2018
bg_chrX
ballgown instance with 3354 transcripts and 12 samples
过滤掉表达量低的基因
bg_chrX_filt=subset(bg_chrX,"rowVars(texpr(bg_chrX))>1",genomesubset=TRUE)
对转录本进行差异分析
results_transcripts=stattest(bg_chrX_filt,feature="transcript",covariate="sex",adjustvars=c("population"),getFC=TRUE,meas="FPKM")
对基因进行差异分析
results_genes=stattest(bg_chrX_filt,feature="gene",covariate="sex",adjustvars=c("population"),getFC=TRUE,meas="FPKM")
给转录本添加基因名和基因ID
results_transcripts=data.frame(geneNames=ballgown::geneNames(bg_chrX_filt),geneIDs=geneIDs(bg_chrX_filt),results_transcripts)
根据p-value值对分析结果进行排序(从小到大)
results_transcripts=arrange(results_transcripts,pval)
results_genes=arrange(results_genes,pval)
write.csv(results_transcripts, "chrX_transcript_results.csv",row.names=FALSE)
write.csv(results_genes, "chrX_gene_results.csv",row.names=FALSE)
提取显著差异表达的基因和转录本(q-value<0.05)
results_transcripts_0.05=subset(results_transcripts,results_transcripts$qval<0.05)
results_genes_0.05=subset(results_genes,results_genes$qval<0.05)
write.csv(results_transcripts_0.05,file="chrX_transcript_0.05.csv",row.names=FALSE)
write.csv(results_genes_0.05,file="chrX_genes_0.05.csv",row.names=FALSE)
数据可视化之颜色设定
tropical= c('darkorange', 'dodgerblue','hotpink', 'limegreen', 'yellow')
palette(tropical)
#当然rainbow()函数也可以完成这个任务
#palette(rainbow(5))
以FPKM为参考值作图,以性别作为区分条件
options(repr.plot.width = 9, repr.plot.height = 6)
fpkm = texpr(bg_chrX,meas="FPKM")
#方便作图将其log转换,+1是为了避免出现log2(0)的情况
fpkm = log2(fpkm+1)
# tiff(filename ="FPKM.tiff", compression = "lzw")
boxplot(fpkm,col=as.numeric(pheno_data$sex),las=2,ylab='log2(FPKM+1)')
# dev.off()
suppressMessages(library(reshape))
suppressMessages(library(ggplot2))
fpkm_L <- melt(fpkm)
tail(fpkm_L)
X1 | X2 | value | |
---|---|---|---|
40243 | 3349 | FPKM.ERR204916 | 0.00000000 |
40244 | 3350 | FPKM.ERR204916 | 3.18196839 |
40245 | 3351 | FPKM.ERR204916 | 0.04649665 |
40246 | 3352 | FPKM.ERR204916 | 0.00000000 |
40247 | 3353 | FPKM.ERR204916 | 0.00000000 |
40248 | 3354 | FPKM.ERR204916 | 0.00000000 |
colnames(fpkm_L)=c('n','sample','value')
head(fpkm_L)
n | sample | value |
---|---|---|
1 | FPKM.ERR188044 | 2.218154 |
2 | FPKM.ERR188044 | 0.000000 |
3 | FPKM.ERR188044 | 5.333466 |
4 | FPKM.ERR188044 | 0.000000 |
5 | FPKM.ERR188044 | 0.000000 |
6 | FPKM.ERR188044 | 8.949343 |
group_list=as.character(pheno_data$sex)
group_list
- 'male'
- 'male'
- 'female'
- 'female'
- 'male'
- 'female'
- 'female'
- 'male'
- 'male'
- 'female'
- 'male'
- 'female'
fpkm_L$group=rep(group_list,each=nrow(fpkm))
tail(fpkm_L)
n | sample | value | group | |
---|---|---|---|---|
40243 | 3349 | FPKM.ERR204916 | 0.00000000 | female |
40244 | 3350 | FPKM.ERR204916 | 3.18196839 | female |
40245 | 3351 | FPKM.ERR204916 | 0.04649665 | female |
40246 | 3352 | FPKM.ERR204916 | 0.00000000 | female |
40247 | 3353 | FPKM.ERR204916 | 0.00000000 | female |
40248 | 3354 | FPKM.ERR204916 | 0.00000000 | female |
options(repr.plot.width = 9, repr.plot.height = 5)
p=ggplot(fpkm_L,aes(x=sample,y=value,fill=group))+
geom_boxplot() +
theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1,size = 6))
p
#?graphics::boxplot
结果可视化
火山图
DE <- read.csv('DE/GBR.f.vs.m_0.05_edgeR.csv',row.names = 1)
tail(DE)
logFC | logCPM | F | PValue | |
---|---|---|---|---|
F8A2 | 1.7585269 | 1.446163 | 1.3961933 | 0.26242263 |
F8A3 | 2.0319356 | 3.642920 | 4.1728251 | 0.06200827 |
TMLHE | 0.2410015 | 8.211140 | 1.4443700 | 0.25097602 |
VAMP7 | -0.1153725 | 10.608449 | 0.6283750 | 0.44224975 |
IL9R | -0.4700823 | 6.023040 | 0.7402906 | 0.40523418 |
WASIR1 | -1.1378042 | 1.615207 | 0.4942440 | 0.49450021 |
DEG <- data.frame(DE$PValue,DE$logFC)
colnames(DEG) <- c('p','logFC')
rownames(DEG) <- rownames(DE)
tail(DEG)
p | logFC | |
---|---|---|
F8A2 | 0.26242263 | 1.7585269 |
F8A3 | 0.06200827 | 2.0319356 |
TMLHE | 0.25097602 | 0.2410015 |
VAMP7 | 0.44224975 | -0.1153725 |
IL9R | 0.40523418 | -0.4700823 |
WASIR1 | 0.49450021 | -1.1378042 |
示例数据量少,筛选出的差异基因也少,画图的效果不太好
options(repr.plot.width = 8, repr.plot.height = 6)
my_volcano <- function(DEG, t_p, t_FC = 0, t_marker){
## example: print(my_volcano(a[,c(5,1)], 0.01, 0.6, 0))
library(ggplot2)
DEG = na.omit(DEG)
colnames(DEG) = c('p','logFC')
DEG$gene <- rownames(DEG)
DEG[DEG$p < 1e-10, 'p'] = 1e-10
if (t_FC == 0){
t_FC <- with(DEG, mean(abs(logFC)) + 2 * sd(abs(logFC)))
}
DEG$change = as.factor(ifelse(DEG$p < t_p & abs(DEG$logFC) > t_FC,ifelse(DEG$logFC > t_FC, "UP", "DOWN"),"NOT"))
this_title <- paste0("Cutoff for logFC is ", round(t_FC, 3),
"\nThe number of up gene is ",
nrow(DEG[DEG$change == "UP", ]),
"\nThe number of down gene is ",
nrow(DEG[DEG$change == "DOWN", ])
)
p = ggplot(data = DEG, aes(x = logFC, y = -log10(p), color = change)) +
geom_point() +
scale_color_manual(values = c('blue', 'black', 'red')) +
geom_hline(yintercept = -log10(t_p), lty = 4, lwd = 0.6, alpha = 0.8) +
geom_vline(xintercept = c(t_FC, -t_FC), lty = 4, lwd = 0.6, alpha = 0.8) +
theme_bw() +
ylim(0,5) +
xlim(-10,10) +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")) + ggtitle(this_title) +
labs(x = "log2(fold change)", y = "-log10(q-value)") +
theme(plot.title = element_text(hjust = 0.5))
if (t_marker != 0) {
DE <- subset(DEG, abs(logFC) > t_marker)
p = p + geom_text(data = DE , aes(label = gene), col = "green",alpha = 0.5)
}
return(p)
}
print(my_volcano(DEG, 0.05, 1, 4.2))
Warning message:
“Removed 1 rows containing missing values (geom_point).”Warning message:
“Removed 1 rows containing missing values (geom_text).”
热图层次聚类
# We estimate the variance for each row in the logcounts matrix
var_genes <- apply(logcounts, 1, var)
head(var_genes)
- PLCXD1
- 0.412417349394376
- GTPBP6
- 0.236517004810871
- PPP2R3B
- 0.452153668390167
- CRLF2
- 0.271975933288722
- CSF2RA
- 2.64355917157874
- IL3RA
- 0.462801417616373
# Get the gene names for the top 50 most variable genes
select_var <- names(sort(var_genes, decreasing=TRUE))[1:50]
head(select_var)
- 'XIST'
- 'EGFL6'
- 'NLGN4X'
- 'CT45A3'
- 'LONRF3'
- 'KLHL13'
# Subset logcounts matrix
highly_variable_lcpm <- logcounts[select_var,]
dim(highly_variable_lcpm)
- 50
- 12
my_heatmap <- function(countdata,coldata){
morecols <- colorRampPalette(c("red","black","green"))
col.sex <- c("green","red")[coldata$sex]
label = paste(coldata$sex ,'.', coldata$population)
# Plot the heatmap
heatmap.2(countdata,labCol=label,srtCol=15,
col=rev(morecols(50)),trace="none", main="Top 50 most variable genes across samples",
ColSideColors=col.sex,colCol=col.sex,scale="row")
}
my_heatmap(highly_variable_lcpm, coldata)