和上一回的算法相比,这是一个更加注重局部比对的算法,返回的是一段对比度最高的序列。其最精彩之处就在于引入了0这一可能,这样在任何时候,序列比较都能从当前位置从头开始,从而实现局部性。
为了能更有效率地完成回溯,在给矩阵打分时将打分结果记录在了几个数组中,并且在回溯时使用这几个数组加速过程。
这次写又掌握了numpy的许多新功能,比如zip、around、arange等等。具体用法可以看下面的代码。其中L矩阵是直接比对的情况,P是对seq2加空格比对的情况,Q是对seq1加空格比对的情况。
import numpy as np
# 对任意两个字符进行比较
def compare(m, n, match, n_match):
if m == n:
return match
else:
return n_match
#构建打分矩阵
def mark(seq1, seq2, match, n_match, v, u):
"用来生成最终的打分矩阵,传入形参分别为两个序列和匹配罚分、错配罚分、空位设置罚值,空位扩展罚值(出现空位的罚值为v+u*k)"
a = len(seq1)
b = len(seq2)
# 初始化打分矩阵
S = np.zeros((a + 1, b + 1))
# 顺序匹配的情况
L = np.zeros((a + 1, b + 1))
# 序列1产生若干空位情况
P = np.zeros((a + 1, b + 1))
# 序列2产生若干空位情况
Q = np.zeros((a + 1, b + 1))
# 在字符串最前添加前导空格
seq1 = " " + seq1[:]
seq2 = " " + seq2[:]
for r in range(1, b+1 if a>b else a+1):
for c in range(r, b+1):
L[r,c] = S[r-1,c-1]+compare(seq1[r],seq2[c],match,n_match)
P[r,c] = max(np.add(S[0:r,c],-(np.arange(r,0,-1)*u+v)))
Q[r,c] = max(np.add(S[r,0:c],-(np.arange(c,0,-1)*u+v)))
S[r,c] = max([0,L[r,c],P[r,c],Q[r,c]])
for c in range(r+1, a+1):
L[c, r] = S[c-1, r-1]+compare(seq1[c], seq2[r],match,n_match)
P[c, r] = max(np.add(S[0:c, r], -(np.arange(c,0,-1) * u + v)))
Q[c, r] = max(np.add(S[c, 0:r], -(np.arange(r,0,-1) * u + v)))
S[c, r] = max([0, L[c, r], P[c, r], Q[c, r]])
return S,P,Q,L
def traceback(seq1, seq2, v, u, S, current_x, current_y, S1, S2, t1, t2, L, P, Q):
print(t1, " ", t2, " ", current_x, " ", current_y)
if current_x == 1 and current_y == 1:
S1.append(seq1[1] + t1[:])
S2.append(seq2[1] + t2[:])
return
if S[current_x,current_y] == L[current_x,current_y]:
print("first")
traceback(seq1, seq2, v, u, S, current_x - 1, current_y - 1, S1, S2, seq1[current_x] + t1[:], seq2[current_y] + t2[:], L, P, Q)
if S[current_x,current_y] == P[current_x,current_y]:
step = np.add(S[0:current_x,current_y],-(np.arange(current_x,0,-1)*u+v))
step = list(step)
step = current_x-step.index(P[current_x, current_y])
print("second", step, P[current_x,current_y])
traceback(seq1, seq2, v, u, S, current_x - step, current_y, S1, S2, seq1[current_x]+t1[:], "-"*step+t2[:], L, P, Q)
if S[current_x,current_y] == Q[current_x,current_y]:
step = np.add(S[current_x, 0:current_y], -(np.arange(current_y, 0, -1) * u + v))
step = list(step)
step = current_y - step.index(Q[current_x, current_y])
print("third", step)
traceback(seq1, seq2, v, u, S, current_x, current_y-step, S1, S2, "-"*step+t1[:], seq2[current_y]+t2[:], L, P, Q)
# 一定要写1/3,而不要写成0.33
S, P, Q, L = mark("CTATAATCCC", "CTGTATC", 1, -1/3, 1, 1/3)
# 对整个矩阵进行舍入运算
print("S = ",np.around(S,2))
print("P = ",np.around(P,2))
print("Q = ",np.around(Q,2))
print("L = ",np.around(L,2))
S1 = []
S2 = []
t1 = ""
t2 = ""
R = np.where(S == np.max(S))
# 这是同时遍历两个列表的方法
for x, y in zip(R[0],R[1]):
traceback(" CTATAATCCC", " CTGTATC", 1, 1 / 3, S, x, y, S1, S2, t1, t2, L, P, Q)
print(S1)
print(S2)