新建文件夹
# Create the 13.secreted_protein working directory and change into it.
mkdir 13.secreted_protein && cd 13.secreted_protein
# Work inside the pfam_scan conda environment.
conda activate pfam_scan
protein.fa:/media/aa/DATA/SZQ2/bj/functional_annotation/pep70_noStar/$i.noStar.fasta
protein.fa:/media/aa/DATA/SZQ2/bj/functional_annotation/pepmy_noStar/$i.noStar.fasta
第1步SignalP:进行信号肽分析。分泌蛋白都具有信号肽。
新建文件夹
# Create the SignalP working directory and enter it.
# Fix: the directory was created as "singalp" but the cd targeted "signalp",
# so the cd always failed. The on-disk spelling "singalp" is kept because the
# pep70 TMHMM batch step reads .../13.secreted_protein/singalp/.
mkdir -p singalp && cd singalp
# Single-sample example: run signalp5 on the Amath1 protein FASTA from pep70.
signalp5 -batch 30000 -org euk -fasta /media/aa/DATA/SZQ2/bj/functional_annotation/pep70/Amath1.fasta -gff3 -mature
批量操作
批量 pep70
# Emit one signalp5 invocation per pep70 sample (names read from 70list.txt)
# into command.signalp.list, then hand that list to ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt); do
  printf '%s\n' "signalp5 -batch 30000 -org euk -fasta /media/aa/DATA/SZQ2/bj/functional_annotation/pep70/${i}.fasta -gff3 -mature"
done >command.signalp.list
ParaFly -c command.signalp.list -CPU 48
批量 pepmy
# Emit one signalp5 invocation per pepmy sample (names read from
# pepmylist.txt) into command.signalp.list, then hand that list to ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt); do
  printf '%s\n' "signalp5 -batch 30000 -org euk -fasta /media/aa/DATA/SZQ2/bj/functional_annotation/pepmy/${i}.fasta -gff3 -mature"
done >command.signalp.list
ParaFly -c command.signalp.list -CPU 48
第2步TMHMM:进行跨膜区分析。若具有跨膜区,则蛋白会和膜进行结合,从而固定到膜上,不会成为分泌蛋白。
新建文件夹
# Create the TMHMM working directory and change into it.
mkdir TMHMM && cd TMHMM
1)tmhmm
# Single-sample example: run tmhmm on Amath1_mature.fasta and save its output
# as Amath1.tmhmm.out in the current directory.
tmhmm /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/Amath1_mature.fasta > Amath1.tmhmm.out
批量操作
批量 pep70
# Build the TMHMM command list for the pep70 samples and run it with ParaFly.
# Fix: the input file is now written as ${i}_mature.fasta. A bare $i_mature is
# read by the shell as the (unset) variable "i_mature", which turned every
# input path into ".../singalp/.fasta".
# NOTE(review): the input directory is spelled "singalp" — confirm it matches
# the directory that actually holds the SignalP output.
#
# emit_tmhmm_cmds LIST_FILE MATURE_DIR
#   Prints one "tmhmm MATURE_DIR/<sample>_mature.fasta > <sample>.tmhmm.out"
#   line per sample name (one name per line) found in LIST_FILE.
#   (Defined here so this snippet can be pasted on its own.)
emit_tmhmm_cmds() {
  local list_file=$1 mature_dir=$2 i
  while read -r i || [ -n "$i" ]; do
    [ -n "$i" ] || continue
    printf 'tmhmm %s/%s_mature.fasta > %s.tmhmm.out\n' "$mature_dir" "$i" "$i"
  done < "$list_file"
}
# ParaFly only runs when the command list was generated successfully.
emit_tmhmm_cmds \
  /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt \
  /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/singalp \
  > command.tmhmm.list &&
  ParaFly -c command.tmhmm.list -CPU 48
批量 pepmy
# Build the TMHMM command list for the pepmy samples and run it with ParaFly.
# Fix: the input file is now written as ${i}_mature.fasta. A bare $i_mature is
# read by the shell as the (unset) variable "i_mature", which turned every
# input path into ".../signalp/.fasta".
#
# emit_tmhmm_cmds LIST_FILE MATURE_DIR
#   Prints one "tmhmm MATURE_DIR/<sample>_mature.fasta > <sample>.tmhmm.out"
#   line per sample name (one name per line) found in LIST_FILE.
#   (Defined here so this snippet can be pasted on its own.)
emit_tmhmm_cmds() {
  local list_file=$1 mature_dir=$2 i
  while read -r i || [ -n "$i" ]; do
    [ -n "$i" ] || continue
    printf 'tmhmm %s/%s_mature.fasta > %s.tmhmm.out\n' "$mature_dir" "$i" "$i"
  done < "$list_file"
}
# ParaFly only runs when the command list was generated successfully.
emit_tmhmm_cmds \
  /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt \
  /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/pepmy/signalp \
  > command.tmhmm.list &&
  ParaFly -c command.tmhmm.list -CPU 48
2)grep
# Single-sample example: keep the lines reporting zero predicted TMHs and
# reduce each one to the first token after "#", writing those tokens to
# Amath1.genes_without_TMHs.list.
# NOTE(review): the pattern has a single space after "TMHs:" — confirm this
# matches the spacing tmhmm actually prints.
grep "Number of predicted TMHs: 0" Amath1.tmhmm.out | perl -p -e 's/#\s+(\S+).*/$1/' > Amath1.genes_without_TMHs.list
批量操作
批量 pep70
# Build, for every pep70 sample, the grep|perl command that lists the IDs of
# proteins with zero predicted TMHs, then run the list with ParaFly.
# Fix: the old echo nested double quotes inside a double-quoted string and let
# the shell expand $1, so the generated line lost the quotes around the grep
# pattern and the perl replacement became empty. The pieces are now passed to
# printf as literal arguments.
# NOTE(review): the grep pattern has a single space after "TMHs:" — confirm
# this matches the spacing tmhmm actually prints.
#
# emit_tmh_filter_cmds LIST_FILE
#   Prints one command line per sample name (one name per line) in LIST_FILE.
#   (Defined here so this snippet can be pasted on its own.)
emit_tmh_filter_cmds() {
  local list_file=$1 i
  # Single-quoted so that \s, \S and $1 reach perl untouched.
  local perl_expr='s/#\s+(\S+).*/$1/'
  while read -r i || [ -n "$i" ]; do
    [ -n "$i" ] || continue
    printf 'grep "Number of predicted TMHs: 0" %s.tmhmm.out | perl -p -e '\''%s'\'' > %s.genes_without_TMHs.list\n' \
      "$i" "$perl_expr" "$i"
  done < "$list_file"
}
# ParaFly only runs when the command list was generated successfully.
emit_tmh_filter_cmds /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt \
  > command.grep.list &&
  ParaFly -c command.grep.list -CPU 48
批量 pepmy
# Build, for every pepmy sample, the grep|perl command that lists the IDs of
# proteins with zero predicted TMHs, then run the list with ParaFly.
# Fix: the old echo nested double quotes inside a double-quoted string and let
# the shell expand $1, so the generated line lost the quotes around the grep
# pattern and the perl replacement became empty. The pieces are now passed to
# printf as literal arguments.
# NOTE(review): the grep pattern has a single space after "TMHs:" — confirm
# this matches the spacing tmhmm actually prints.
#
# emit_tmh_filter_cmds LIST_FILE
#   Prints one command line per sample name (one name per line) in LIST_FILE.
#   (Defined here so this snippet can be pasted on its own.)
emit_tmh_filter_cmds() {
  local list_file=$1 i
  # Single-quoted so that \s, \S and $1 reach perl untouched.
  local perl_expr='s/#\s+(\S+).*/$1/'
  while read -r i || [ -n "$i" ]; do
    [ -n "$i" ] || continue
    printf 'grep "Number of predicted TMHs: 0" %s.tmhmm.out | perl -p -e '\''%s'\'' > %s.genes_without_TMHs.list\n' \
      "$i" "$perl_expr" "$i"
  done < "$list_file"
}
# ParaFly only runs when the command list was generated successfully.
emit_tmh_filter_cmds /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt \
  > command.grep.list &&
  ParaFly -c command.grep.list -CPU 48
3)fasta_extract_subseqs_from_list.pl
# Single-sample example: pass the Amath1 protein FASTA and the list of IDs
# without TMHs to fasta_extract_subseqs_from_list.pl and save its output as
# Amath1.candidate_secreted_proteins.fasta.
# NOTE(review): this line calls the script under /media/aa/DATA1/bin, while the
# batch commands use /media/aa/DATA2/bin — confirm which path is correct.
/media/aa/DATA1/bin/fasta_extract_subseqs_from_list.pl /media/aa/DATA/SZQ2/bj/functional_annotation/pep70/Amath1.fasta Amath1.genes_without_TMHs.list > Amath1.candidate_secreted_proteins.fasta
结果保存在$i.candidate_secreted_proteins.fasta
批量操作
批量 pep70
# For each pep70 sample, queue a fasta_extract_subseqs_from_list.pl call that
# takes the sample's protein FASTA plus its genes_without_TMHs list and writes
# <sample>.candidate_secreted_proteins.fasta; then run the queue with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt); do
  printf '%s\n' "/media/aa/DATA2/bin/fasta_extract_subseqs_from_list.pl /media/aa/DATA/SZQ2/bj/functional_annotation/pep70/${i}.fasta ${i}.genes_without_TMHs.list > ${i}.candidate_secreted_proteins.fasta"
done >command.fasta_extract_subseqs_from_list.list
ParaFly -c command.fasta_extract_subseqs_from_list.list -CPU 48
批量 pepmy
# For each pepmy sample, queue a fasta_extract_subseqs_from_list.pl call that
# takes the sample's protein FASTA plus its genes_without_TMHs list and writes
# <sample>.candidate_secreted_proteins.fasta; then run the queue with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt); do
  printf '%s\n' "/media/aa/DATA2/bin/fasta_extract_subseqs_from_list.pl /media/aa/DATA/SZQ2/bj/functional_annotation/pepmy/${i}.fasta ${i}.genes_without_TMHs.list > ${i}.candidate_secreted_proteins.fasta"
done >command.fasta_extract_subseqs_from_list.list
ParaFly -c command.fasta_extract_subseqs_from_list.list -CPU 48
第3步PredGPI:分析GPI锚定位点。GPI锚定蛋白和膜结合,从而固定到膜上,不会成为分泌蛋白。
# Create the PredGPI working directory and change into it.
mkdir PredGPI && cd PredGPI
通过PredGPI网页服务器进行(链接:http://gpcr.biocomp.unibo.it/predgpi/ ,使用前请确认链接仍然有效):
一次最多允许提交500条序列。
将candidate_secreted_proteins.fasta分成candidate_secreted_proteins1.fasta和candidate_secreted_proteins2.fasta等多个文件。
提交后点击download。不填写邮箱!
结果文件是fasta格式文件,其头部包含结果信息。
PredGPI.fasta
取阈值FDR 0.5%。(注意:下文perl命令实际使用的条件是 FPrate <= 0.01;若FPrate以小数表示,这对应1%,要严格按0.5%筛选应改为0.005,请核对。)
FDR <= 0.1% GPI-anchored: highly probable
FDR <= 0.5% GPI-anchored: probable
FDR <= 1.0% GPI-anchored: lowly probable
得到GPIPE_query_results__tmp_tmp-h5nwU.txt和GPIPE_query_results__tmp_tmpPAyY71.txt等文件。
1)合并这些结果文件
# Template for one sample ($i): concatenate the two downloaded PredGPI result
# files into a single file.
cat GPIPE_query_results__tmp_tmp-h5nwU.txt GPIPE_query_results__tmp_tmpPAyY71.txt > $i.GPIPE_query_results__tmp.txt
2)复制成fasta格式
# Template for one sample ($i): copy the merged result under a .fasta name.
cp $i.GPIPE_query_results__tmp.txt $i.PredGPI.fasta
批量操作
批量 pep70
# Queue one cp per pep70 sample that copies the merged PredGPI result to
# <sample>.PredGPI.fasta, then run the queue with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt); do
  printf '%s\n' "cp ${i}.GPIPE_query_results__tmp.txt ${i}.PredGPI.fasta"
done >command.cp.list
ParaFly -c command.cp.list -CPU 48
批量 pepmy
# Queue one cp per pepmy sample that copies the merged PredGPI result to
# <sample>.PredGPI.fasta, then run the queue with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt); do
  printf '%s\n' "cp ${i}.GPIPE_query_results__tmp.txt ${i}.PredGPI.fasta"
done >command.cp.list
ParaFly -c command.cp.list -CPU 48
3)perl
# Template for one sample ($i): for every FASTA header line carrying an
# "FPrate:<value>" field with value <= 0.01, print the ID (first token after
# ">") to $i.GPI_gene.list.
perl -e 'while (<>) { if (m/^>(\S+).*FPrate:(\S+)/ && $2 <= 0.01) { print "$1\n"; } }' $i.PredGPI.fasta > $i.GPI_gene.list
批量操作
批量 pep70
# Build, for every pep70 sample, the perl command that extracts the IDs of
# GPI-anchored proteins (header FPrate <= 0.01), then run the list with ParaFly.
# Fix: inside the old double-quoted echo the shell expanded $1 and $2 to empty
# strings and the inner "..." ended the string early, so the generated perl
# program was corrupted. The program is now kept in a single-quoted variable
# and handed to printf as a literal argument.
#
# emit_gpi_filter_cmds LIST_FILE
#   Prints one command line per sample name (one name per line) in LIST_FILE.
#   (Defined here so this snippet can be pasted on its own.)
emit_gpi_filter_cmds() {
  local list_file=$1 i
  # Single-quoted: $1, $2, \S and \n must reach perl unchanged.
  local perl_prog='while (<>) { if (m/^>(\S+).*FPrate:(\S+)/ && $2 <= 0.01) { print "$1\n"; } }'
  while read -r i || [ -n "$i" ]; do
    [ -n "$i" ] || continue
    printf "perl -e '%s' %s.PredGPI.fasta > %s.GPI_gene.list\n" \
      "$perl_prog" "$i" "$i"
  done < "$list_file"
}
# ParaFly only runs when the command list was generated successfully.
emit_gpi_filter_cmds /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt \
  > command.perl.list &&
  ParaFly -c command.perl.list -CPU 48
批量 pepmy
# Build, for every pepmy sample, the perl command that extracts the IDs of
# GPI-anchored proteins (header FPrate <= 0.01), then run the list with ParaFly.
# Fix: inside the old double-quoted echo the shell expanded $1 and $2 to empty
# strings and the inner "..." ended the string early, so the generated perl
# program was corrupted. The program is now kept in a single-quoted variable
# and handed to printf as a literal argument.
#
# emit_gpi_filter_cmds LIST_FILE
#   Prints one command line per sample name (one name per line) in LIST_FILE.
#   (Defined here so this snippet can be pasted on its own.)
emit_gpi_filter_cmds() {
  local list_file=$1 i
  # Single-quoted: $1, $2, \S and \n must reach perl unchanged.
  local perl_prog='while (<>) { if (m/^>(\S+).*FPrate:(\S+)/ && $2 <= 0.01) { print "$1\n"; } }'
  while read -r i || [ -n "$i" ]; do
    [ -n "$i" ] || continue
    printf "perl -e '%s' %s.PredGPI.fasta > %s.GPI_gene.list\n" \
      "$perl_prog" "$i" "$i"
  done < "$list_file"
}
# ParaFly only runs when the command list was generated successfully.
emit_gpi_filter_cmds /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt \
  > command.perl.list &&
  ParaFly -c command.perl.list -CPU 48
4)fasta_extract_subseqs_from_list.pl
新建文件夹
# Create the NO_GPI70 directory and change into it.
mkdir NO_GPI70 && cd NO_GPI70
# Template for one sample ($i): run the extraction script with --reverse on the
# candidate FASTA and the GPI gene list, writing the NO_GPI FASTA.
# NOTE(review): --reverse presumably keeps the sequences NOT named in the list
# — confirm against the script's usage text.
/media/aa/DATA2/bin/fasta_extract_subseqs_from_list.pl --reverse $i.candidate_secreted_proteins.fasta $i.GPI_gene.list > $i.candidate_secreted_proteins.NO_GPI.fasta
批量操作
批量 pep70
# Queue, per pep70 sample, a --reverse extraction that reads the candidate
# FASTA from pep70/TMHMM and the GPI gene list from the parent directory and
# writes <sample>.candidate_secreted_proteins.NO_GPI.fasta; run with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt); do
  printf '%s\n' "/media/aa/DATA2/bin/fasta_extract_subseqs_from_list.pl --reverse /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/pep70/TMHMM/${i}.candidate_secreted_proteins.fasta ../${i}.GPI_gene.list > ${i}.candidate_secreted_proteins.NO_GPI.fasta"
done >command.fasta_extract_subseqs_from_list.list
ParaFly -c command.fasta_extract_subseqs_from_list.list -CPU 48
批量 pepmy
# Queue, per pepmy sample, a --reverse extraction that reads the candidate
# FASTA from pepmy/TMHMM and the GPI gene list from the parent directory and
# writes <sample>.candidate_secreted_proteins.NO_GPI.fasta; run with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt); do
  printf '%s\n' "/media/aa/DATA2/bin/fasta_extract_subseqs_from_list.pl --reverse /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/pepmy/TMHMM/${i}.candidate_secreted_proteins.fasta ../${i}.GPI_gene.list > ${i}.candidate_secreted_proteins.NO_GPI.fasta"
done >command.fasta_extract_subseqs_from_list.list
ParaFly -c command.fasta_extract_subseqs_from_list.list -CPU 48
第4步BUSCA:进行亚细胞定位,选取定位到胞外的蛋白作为分泌蛋白基因。
新建文件夹
# Create the BUSCA working directory and change into it.
mkdir BUSCA && cd BUSCA
通过BUSCA网页服务器进行(链接:http://busca.biocomp.unibo.it/ ,使用前请确认链接仍然有效):
一次最多允许提交500条序列。要用不带“*”的文件。
把candidate_secreted_proteins.NO_GPI.fasta分成candidate_secreted_proteins.NO_GPIa.fasta和candidate_secreted_proteins.NO_GPIb.fasta等多个文件。
选择Fungi类型,点击Start prediction。
下载表格格式结果:BUSCA_JOB_eb6d87ab-b921-4c04-89f1-3cf018a63898.csv和BUSCA_JOB_43a0a87d-bebe-45a7-a368-7238c1c47621.csv
1)合并两者
# Template for one sample ($i): concatenate the two downloaded BUSCA CSV
# tables into a single file.
cat BUSCA_JOB_eb6d87ab-b921-4c04-89f1-3cf018a63898.csv BUSCA_JOB_43a0a87d-bebe-45a7-a368-7238c1c47621.csv > $i.BUSCA.out.csv
# Skip the first line, split the remaining lines on commas and group the
# column-1 values by their column-3 value. Per group, the group name and its
# number of distinct column-1 values go to STDERR (saved as the .stats file);
# the column-1 values of the group "C:extracellular space" go to STDOUT
# (saved as $i.extracellular_gene.list).
perl -e '<>; while (<>) { @_ = split /,/; $stats{$_[2]}{$_[0]} = 1; } foreach (sort keys %stats) { @gene = sort keys %{$stats{$_}}; my $gene_number = 0; $gene_number = @gene; print STDERR "$_\t$gene_number\n"; if ($_ eq "C:extracellular space") { foreach (@gene) { print "$_\n"; } } }' $i.BUSCA.out.csv > $i.extracellular_gene.list 2> $i.BUSCA.out.csv.stats
批量操作
批量 pep70
# Build, for every pep70 sample, the perl command that extracts the
# "C:extracellular space" gene IDs from <sample>.BUSCA.out.csv, then run the
# list with ParaFly.
# Fix: the loop used to emit a "cp" placeholder that had to be replaced by the
# perl one-liner by hand in a text editor. The real command is now generated
# directly; the perl program is kept in a single-quoted variable so the shell
# leaves its $ variables and quotes alone.
#
# emit_busca_filter_cmds LIST_FILE
#   Prints one command line per sample name (one name per line) in LIST_FILE.
#   (Defined here so this snippet can be pasted on its own.)
emit_busca_filter_cmds() {
  local list_file=$1 i
  local perl_prog='<>; while (<>) { @_ = split /,/; $stats{$_[2]}{$_[0]} = 1; } foreach (sort keys %stats) { @gene = sort keys %{$stats{$_}}; my $gene_number = 0; $gene_number = @gene; print STDERR "$_\t$gene_number\n"; if ($_ eq "C:extracellular space") { foreach (@gene) { print "$_\n"; } } }'
  while read -r i || [ -n "$i" ]; do
    [ -n "$i" ] || continue
    printf "perl -e '%s' %s.BUSCA.out.csv > %s.extracellular_gene.list 2> %s.BUSCA.out.csv.stats\n" \
      "$perl_prog" "$i" "$i" "$i"
  done < "$list_file"
}
# ParaFly only runs when the command list was generated successfully.
emit_busca_filter_cmds /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt \
  > command.perl.list &&
  ParaFly -c command.perl.list -CPU 48
批量 pepmy
# Build, for every pepmy sample, the perl command that extracts the
# "C:extracellular space" gene IDs from <sample>.BUSCA.out.csv, then run the
# list with ParaFly.
# Fix: the loop used to emit a "cp" placeholder that had to be replaced by the
# perl one-liner by hand in a text editor. The real command is now generated
# directly; the perl program is kept in a single-quoted variable so the shell
# leaves its $ variables and quotes alone.
#
# emit_busca_filter_cmds LIST_FILE
#   Prints one command line per sample name (one name per line) in LIST_FILE.
#   (Defined here so this snippet can be pasted on its own.)
emit_busca_filter_cmds() {
  local list_file=$1 i
  local perl_prog='<>; while (<>) { @_ = split /,/; $stats{$_[2]}{$_[0]} = 1; } foreach (sort keys %stats) { @gene = sort keys %{$stats{$_}}; my $gene_number = 0; $gene_number = @gene; print STDERR "$_\t$gene_number\n"; if ($_ eq "C:extracellular space") { foreach (@gene) { print "$_\n"; } } }'
  while read -r i || [ -n "$i" ]; do
    [ -n "$i" ] || continue
    printf "perl -e '%s' %s.BUSCA.out.csv > %s.extracellular_gene.list 2> %s.BUSCA.out.csv.stats\n" \
      "$perl_prog" "$i" "$i" "$i"
  done < "$list_file"
}
# ParaFly only runs when the command list was generated successfully.
emit_busca_filter_cmds /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt \
  > command.perl.list &&
  ParaFly -c command.perl.list -CPU 48
2) 需要把$i.extracellular_gene.list中的3_15188_3_15188替换成3_15188|3_15188
# Template for one sample ($i): rewrite IDs of the form <i>_<i> to <i>|<i>,
# editing $i.extracellular_gene.list in place.
# Fixes: (1) the expression was wrapped in typographic quotes, which the shell
# does not treat as quotes, so the bare | was parsed as a pipe; (2) $i_ was
# read as the variable "i_", so the name is now written as ${i}.
sed -i "s/${i}_${i}/${i}|${i}/g" "${i}.extracellular_gene.list"
批量操作
批量 pep70
# Build, for every pep70 sample, the sed command that rewrites IDs of the form
# <sample>_<sample> to <sample>|<sample> into a .tmp file, then run the list
# with ParaFly.
# Fix: the loop used to emit "||" as a placeholder for "_" (because $i_ is read
# as the variable "i_") and the list had to be corrected by hand. The sample
# name is now substituted explicitly, so no manual edit is needed.
#
# emit_gene_id_fix_cmds LIST_FILE
#   Prints one command line per sample name (one name per line) in LIST_FILE.
emit_gene_id_fix_cmds() {
  local list_file=$1 i
  while read -r i || [ -n "$i" ]; do
    [ -n "$i" ] || continue
    printf "sed 's/%s_%s/%s|%s/g' %s.extracellular_gene.list > %s.extracellular_gene.list.tmp\n" \
      "$i" "$i" "$i" "$i" "$i" "$i"
  done < "$list_file"
}
# ParaFly only runs when the command list was generated successfully.
emit_gene_id_fix_cmds /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt \
  > command.sed.list &&
  ParaFly -c command.sed.list -CPU 48
3)复制并重命名
# Template for one sample ($i): copy the corrected .tmp list over the original
# gene list name.
cp $i.extracellular_gene.list.tmp $i.extracellular_gene.list
批量操作
批量 pep70
# Queue one cp per pep70 sample that copies the corrected list from the
# extracellular_gene/ subdirectory to <sample>.extracellular_gene.list, then
# run the queue with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt); do
  printf '%s\n' "cp extracellular_gene/${i}.extracellular_gene.list.tmp ${i}.extracellular_gene.list"
done >command.cp.list
ParaFly -c command.cp.list -CPU 48
4)fasta_extract_subseqs_from_list.pl
# Template for one sample ($i): pass the NO_GPI candidate FASTA and the
# extracellular gene list to the extraction script and save its output as the
# NO_GPI.extracellular FASTA.
/media/aa/DATA2/bin/fasta_extract_subseqs_from_list.pl $i.candidate_secreted_proteins.NO_GPI.fasta $i.extracellular_gene.list > $i.candidate_secreted_proteins.NO_GPI.extracellular.fasta
批量操作
批量 pep70
# Queue, per pep70 sample, an extraction that reads the NO_GPI candidate FASTA
# from pep70/PredGPI plus the sample's extracellular gene list and writes
# <sample>.candidate_secreted_proteins.NO_GPI.extracellular.fasta; then run
# the queue with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt); do
  printf '%s\n' "/media/aa/DATA2/bin/fasta_extract_subseqs_from_list.pl /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/pep70/PredGPI/${i}.candidate_secreted_proteins.NO_GPI.fasta ${i}.extracellular_gene.list > ${i}.candidate_secreted_proteins.NO_GPI.extracellular.fasta"
done >command.fasta_extract_subseqs_from_list.list
ParaFly -c command.fasta_extract_subseqs_from_list.list -CPU 48
批量 pepmy
# Queue, per pepmy sample, an extraction that reads the NO_GPI candidate FASTA
# from pepmy/BUSCA/NO_GPImy plus the sample's extracellular gene list and
# writes <sample>.candidate_secreted_proteins.NO_GPI.extracellular.fasta; then
# run the queue with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt); do
  printf '%s\n' "/media/aa/DATA2/bin/fasta_extract_subseqs_from_list.pl /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/pepmy/BUSCA/NO_GPImy/${i}.candidate_secreted_proteins.NO_GPI.fasta ${i}.extracellular_gene.list > ${i}.candidate_secreted_proteins.NO_GPI.extracellular.fasta"
done >command.fasta_extract_subseqs_from_list.list
ParaFly -c command.fasta_extract_subseqs_from_list.list -CPU 48
5)得到最终结果
新建文件夹
# Create the secreted_proteins directory and change into it.
mkdir secreted_proteins && cd secreted_proteins
# Template for one sample ($i): write the extracellular candidate FASTA out
# under the final name $i.secreted_proteins.fasta.
cat $i.candidate_secreted_proteins.NO_GPI.extracellular.fasta > $i.secreted_proteins.fasta
分泌蛋白最终结果是 $i.secreted_proteins.fasta
批量操作
批量 pep70
# Queue one cat per pep70 sample that writes the extracellular candidate FASTA
# from pep70/BUSCA to <sample>.secreted_proteins.fasta, then run the queue
# with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt); do
  printf '%s\n' "cat /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/pep70/BUSCA/${i}.candidate_secreted_proteins.NO_GPI.extracellular.fasta > ${i}.secreted_proteins.fasta"
done >command.cat.list
ParaFly -c command.cat.list -CPU 48
批量 pepmy
# Queue one cat per pepmy sample that writes the extracellular candidate FASTA
# from pepmy/BUSCA to <sample>.secreted_proteins.fasta, then run the queue
# with ParaFly.
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt); do
  printf '%s\n' "cat /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/pepmy/BUSCA/${i}.candidate_secreted_proteins.NO_GPI.extracellular.fasta > ${i}.secreted_proteins.fasta"
done >command.cat.list
ParaFly -c command.cat.list -CPU 48
分泌蛋白最终结果是 $i.secreted_proteins.fasta
参考文献:Ectomycorrhizal ecology is imprinted in the genome of the dominant symbiotic fungus Cenococcum geophilum(Nature Communications)。该文将分泌组分为四个功能类别:CAZymes、脂肪酶、蛋白酶和小分泌蛋白(SSP;长度小于300个氨基酸且不含CAZyme、蛋白酶或脂肪酶结构域的蛋白质)。
https://nph.onlinelibrary.wiley.com/doi/10.1111/nph.16032
小分泌蛋白(SSP)被定义为含有 < 300个氨基酸的蛋白质,具有信号肽,细胞外定位,没有跨膜结构域。通过 SignalP 4.1(Petersen 等,2011)以“真核”选项对 < 300个氨基酸长的蛋白质进行信号肽预测。使用 WoLF PSORT 0.2(Horton 等,2007)和选项“真菌”检查具有信号肽的蛋白质的细胞外定位,并使用 TMHMM 2.0(Krogh 等,2001)检查跨膜螺旋的存在。
参考文献:Comparative Analysis of Secretomes from Ectomycorrhizal Fungi with an Emphasis on Small-Secreted Proteins(Frontiers)。真菌分泌组由几种蛋白质类别组成,包括蛋白酶、脂肪酶、碳水化合物活性酶(CAZymes)、未知功能的分泌蛋白和小分泌蛋白(SSP)(Alfaro 等,2014)。这些分泌的蛋白质或者通过水解酶如 CAZymes(Zhao et al.,2014)、蛋白酶或脂肪酶参与有机物降解,或者通过表面蛋白如疏水蛋白(Linder et al.,2005)或 SSP(Martin and Kamoun,2011; van Ooij,2011)与宿主相互作用。
SSP 的数量与分泌组的大小呈正相关,最大的分泌组具有最高的 SSP 数量(补充图1B)。
为了鉴定28种腐生菌(白腐菌、棕腐菌以及土壤和凋落物分解菌)与14种菌根真菌(兰科菌根、杜鹃花类菌根(ericoid)和 ECM 共生菌)之间共享的保守 SSP,我们使用 CD-HIT 软件进行了基于序列同一性的聚类分析,同一性阈值设置为70%(图7)。
小分泌蛋白SSP
小分泌蛋白(SSP): 含有 < 300个氨基酸的蛋白质,具有信号肽,细胞外定位,没有跨膜结构域。
# Keep only sequences of at most 299 residues (i.e. < 300 aa), writing one
# <sample>.SSP.fasta per input <sample>.secreted_proteins.fasta.
# Fix: the old command redirected to "*.SSP.fasta". A glob is not a valid
# per-sample output name: it creates a file literally called "*.SSP.fasta"
# when nothing matches, and fails as an ambiguous redirect when several files
# match. Each input file is now processed on its own.
for fa in ../secreted_proteins/*.secreted_proteins.fasta; do
  # Skip the unexpanded pattern when no input file exists.
  [ -e "$fa" ] || continue
  sample=$(basename "$fa" .secreted_proteins.fasta)
  seqkit seq -m 0 -M 299 "$fa" > "${sample}.SSP.fasta"
done
# pep70: from the pep70 SSP directory, queue one seqkit length filter
# (-m 0 -M 299) per sample and run the queue with ParaFly.
cd /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/pep70/SSP
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/70list.txt); do
  printf '%s\n' "seqkit seq -m 0 -M 299 ../secreted_proteins/${i}.secreted_proteins.fasta > ${i}.SSP.fasta"
done >command.seqkit.list
ParaFly -c command.seqkit.list -CPU 4
# pepmy: from the pepmy SSP directory, queue one seqkit length filter
# (-m 0 -M 299) per sample and run the queue with ParaFly.
cd /media/aa/DATA/SZQ2/bj/functional_annotation/13.secreted_protein/pepmy/SSP
for i in $(cat /media/aa/DATA/SZQ2/bj/functional_annotation/pepmylist.txt); do
  printf '%s\n' "seqkit seq -m 0 -M 299 ../secreted_proteins/${i}.secreted_proteins.fasta > ${i}.SSP.fasta"
done >command.seqkit.list
ParaFly -c command.seqkit.list -CPU 4
# Summary statistics for every SSP FASTA in the current directory.
seqkit stat *.SSP.fasta
# Create empty files into which the statistics are pasted by hand.
touch pep70_stat.txt
touch pepmy_stat.txt