使用python后跟着-c这个命令来运行字符串
/usr/bin/python -c 'print "Hello World"'
Hello World
字符在ASCII中对应的数值
/usr/bin/python -c 'print ord("A")'
65
整数的在ASCII中对应的字符
/usr/bin/python -c "print chr(65)"
A
找到字符I对应的Illumina 1.5 (+64)编码质量值
/usr/bin/python -c 'print ord("I") - 64'
9
找到40这个值对应的Illumina 1.5 (+64)编码质量字符
/usr/bin/python -c 'print chr(40 + 64)'
h
phred质量体系
不同版本质量得分与质量字符ASCII值的关系
ASCII值小于等于58(相应的质量得分小于等于25)对应的字符只有在Phred+33的编码中被使用,所有Phred+64所使用的字符的ASCII值都大于等于59。在通常情况下,ASCII值大于等于74的字符只出现在Phred+64中。利用这些信息即可在程序中进行判断
python脚本判断编码及相互转换
import time
import argparse
def sign(func):
"""作者签名"""
def warpper(*args, **kwargs):
name = "The author\'s Sunchengquan"
print(name.center(50, "#"))
courrent_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
print(courrent_time.center(50, "#")+'\n')
func(*args, **kwargs)
return warpper
def timer(func):
def wrapper(*args, **kwargs):
start= time.time()
func(*args, **kwargs)
print(str("program consume:%.2fs" %(time.time()-start)).center(50, "#"))
return wrapper
def fq_Phred_check(inputfile):
'判断fastq序列编码是Phred33(Illumina1.8+) or Phred64(Illumina1.3+)'
fastq_file = open(inputfile,'r',encoding='utf-8')
i = 1
ASCII_list = []
for line in fastq_file:
if (i % 4 == 0) & (i <= 10):
line_strip = line.strip()
for each in line_strip:
ASCII_each = ord(each)
if ASCII_each >= 75:
ASCII_list.append(ASCII_each)
i = i + 1
fastq_file.close()
if len(ASCII_list) == 0:
phred_value = 33
print('The fastq Phred value should be Phred 33'.center(50, "#"))
else:
phred_value = 64
print('The fastq Phred value should be Phred 64'.center(50, "#"))
return phred_value
def fq_Phred_change(inputfile):
'fastq Phred33 与 phred 64 之间转换'
fastq_file = open(inputfile,'r',encoding='utf-8')
fq_phred = fq_Phred_check(inputfile)
if int(fq_phred) == 64:
out_file_name = inputfile.split('.')[0] + '_phred' + str(fq_phred-31)+'.fastq'
else:
out_file_name = inputfile.split('.')[0] + '_phred' + str(fq_phred +31) + '.fastq'
outfile = open(out_file_name,'w',encoding='utf-8')
i = 1
if int(fq_phred) == 64:
for line in fastq_file:
line_strip = line.strip()
if i % 4 == 0:
temp = []
for each in line_strip:
phred_value = ord(each) - 64 + 33
print(phred_value)
each_phred = str(chr(phred_value))
temp.append(each_phred)
print(temp)
Phred33_value = ''.join(temp)
outfile.write(Phred33_value + '\n')
else:
outfile.write(line_strip + '\n')
i = i + 1
print("phred64 transform Phred33 success".center(50, "#"))
elif int(fq_phred) == 33:
for line in fastq_file:
line_strip = line.strip()
if i % 4 == 0:
temp = []
for each in line_strip:
phred_value = ord(each) - 33 + 64
each_phred = chr(phred_value)
temp.append(each_phred)
Phred33_value = ''.join(temp)
outfile.write(Phred33_value + '\n')
else:
outfile.write(line_strip + '\n')
i = i + 1
print("phred33 transform Phred64 success".center(50, "#"))
fastq_file.close()
outfile.close()
@sign
@timer
def main():
# parser=argparse.ArgumentParser(description=__doc__,formatter_class=argparse.RawDescriptionHelpFormatter,
# epilog='author:\t{0}\nmail:\t{1}'.format(__author__,__mail__))
# parser.add_argument("--input",required=True,help="inputfile")
# argv=vars(parser.parse_args())
# inputfile = argv["input"]
inputfile = 'data/test1.fq'
fq_Phred_change(inputfile)
if __name__ == '__main__':
main()
python脚本fastq转换fasta格式
def fastq_fasta(inputfile):
'将fastq转换成fasta序列格式'
fastq_file = open(inputfile,'r',encoding='utf-8')
out_file_name = inputfile.split('.')[0] + '.fasta'
output_fasta = open(out_file_name,'w',encoding='utf-8')
i = 0
for line in fastq_file:
if i % 4 == 0:
line_new = line.strip().split('\n')[0].replace('@','>')
output_fasta.write(line_new + '\n')
elif (i - 1) % 4 == 0:
output_fasta.write(line)
i = i + 1
print("fastq transform fasta success".center(50, "#"))
fastq_file.close()
output_fasta.close()
def main():
inputfile = 'data/test1.fq'
fastq_fasta(inputfile)
if __name__ == '__main__':
main()
从这个课程的网址中获取一个测试数据集
curl -O http://personal.psu.edu/iua1/courses/files/2014/illumina-data.fq.gz
你可以解压这个gzip文件 illumina-data.fq.gz ,也可以不解压,并用zcat (Linux)来打开。当你有很多的数据集时,牺牲读取速度来节省存储也是值得的
time zcat illumina-data.fq.gz > tmp
解压数据,同时保留原压缩文件
gunzip -c illumina-data.fq.gz > illumina-data.fq
time cat illumina-data.fq > tmp
查看FASTQ记录
zcat illumina-data.fq.gz | head -4
@HWI-ST1342:96:H0NP9ADXX:2:1115:13393:59201
CCCATCTTAATCGATCATCAGCTCGAAGTGCACCGATCCGAGTCGCTCGTGTCTGTCGTTATCGTGCCGGACGTTGTAGGCCCAAGCTTTGCACTCAACGTTGATGATGATTCCACGGACTGGGCGCTCGAAGTGTACCGCAACCAGCGG
+
CCCFFFFFHHHGHJIJIIJIJJJJJJIIFGIJJJIJJIJGIJIGEGGIIAEHHHFFFFFDACCDDDDDDDB>B@BD?CDDCDDDDDDDDCDDDDDDACDBBABDDDEEDEEDEEEDDDDDDDDDDDDDDDDDDBCCDEDDDDDDDDDDDD
简单粗暴地检查一下质量,在前10,000行里是否有连续的###字符
zcat illumina-data.fq.gz | head -10000 | grep '###' | wc -l
在数据里搜索pattern并着色
zcat illumina-data.fq.gz | head -10000 | grep 'ATG' --color=always | head
python提取fastq文件中的序列
"""
写程序 grepFastq.py,
提取fastq.name中名字对应的test1.fq的序列。
fastq.name
HWI-ST1223:80:D1FMTACXX:2:1101:1243:2213 1:N:0:AGTCAA
"""
seqFile = "data/test1.fq"
nameFile = "data/fastq.name"
aDict = {}
count = 0
for line in open(seqFile):
if count % 4 == 0:
name = line.strip()[1:]
aDict[name] = [line]
else:
aDict[name].append(line)
count += 1
for line in open(nameFile):
name = line.strip()
print(''.join(aDict[name]))
@HWI-ST1223:80:D1FMTACXX:2:1101:1243:2213 1:N:0:AGTCAA
TCTGTGTAGCCNTGGCTGTCCTGGAACTCACTTTGTAGACCAGGCTGGCATGCACCACCACNNNCGGCTCATTTGTCTTTNNTTTTTGTTTTGTTCTGTA
+
BCCFFFFFFHH#4AFHIJJJJJJJJJJJJJJJJJIJIJJJJJGHIJJJJJJJJJJJJJIIJ###--5ABECFFDDEEEEE##,5=@B8?CDD<AD>C:@>