生信自学笔记(七)Smith Waterman算法的 python 实现

和上一回的算法相比,这是一个更加注重局部比对的算法,返回的是一段对比度最高的序列。其最精彩之处就在于引入了0这一可能,这样在任何时候,序列比较都能从当前位置从头开始,从而实现局部性。

为了能更有效率地完成回溯,在给矩阵打分时将打分结果记录在了几个数组中,并且在回溯时使用这几个数组加速过程。

这次写又掌握了numpy的许多新功能,比如zip、around、arange等等。具体用法可以看下面的代码。其中L矩阵是直接比对的情况,P是对seq2加空格比对的情况,Q是对seq1加空格比对的情况。

import numpy as np

# 对任意两个字符进行比较
def compare(m, n, match, n_match):
    if m == n:
        return match
    else:
        return n_match

#构建打分矩阵
def mark(seq1, seq2, match, n_match, v, u):
    "用来生成最终的打分矩阵,传入形参分别为两个序列和匹配罚分、错配罚分、空位设置罚值,空位扩展罚值(出现空位的罚值为v+u*k)"
    a = len(seq1)
    b = len(seq2)
    # 初始化打分矩阵
    S = np.zeros((a + 1, b + 1))
    # 顺序匹配的情况
    L = np.zeros((a + 1, b + 1))
    # 序列1产生若干空位情况
    P = np.zeros((a + 1, b + 1))
    # 序列2产生若干空位情况
    Q = np.zeros((a + 1, b + 1))
    # 在字符串最前添加前导空格
    seq1 = " " + seq1[:]
    seq2 = " " + seq2[:]
    for r in range(1, b+1 if a>b else a+1):
        for c in range(r, b+1):
            L[r,c] = S[r-1,c-1]+compare(seq1[r],seq2[c],match,n_match)
            P[r,c] = max(np.add(S[0:r,c],-(np.arange(r,0,-1)*u+v)))
            Q[r,c] = max(np.add(S[r,0:c],-(np.arange(c,0,-1)*u+v)))
            S[r,c] = max([0,L[r,c],P[r,c],Q[r,c]])
        for c in range(r+1, a+1):
            L[c, r] = S[c-1, r-1]+compare(seq1[c], seq2[r],match,n_match)
            P[c, r] = max(np.add(S[0:c, r], -(np.arange(c,0,-1) * u + v)))
            Q[c, r] = max(np.add(S[c, 0:r], -(np.arange(r,0,-1) * u + v)))
            S[c, r] = max([0, L[c, r], P[c, r], Q[c, r]])
    return S,P,Q,L



def traceback(seq1, seq2, v, u, S, current_x, current_y, S1, S2, t1, t2, L, P, Q):
    print(t1, " ", t2, " ", current_x, " ", current_y)
    if current_x == 1 and current_y == 1:
        S1.append(seq1[1] + t1[:])
        S2.append(seq2[1] + t2[:])
        return
    if S[current_x,current_y] == L[current_x,current_y]:
        print("first")
        traceback(seq1, seq2, v, u, S, current_x - 1, current_y - 1, S1, S2, seq1[current_x] + t1[:], seq2[current_y] + t2[:], L, P, Q)
    if S[current_x,current_y] == P[current_x,current_y]:
        step = np.add(S[0:current_x,current_y],-(np.arange(current_x,0,-1)*u+v))
        step = list(step)
        step = current_x-step.index(P[current_x, current_y])
        print("second", step, P[current_x,current_y])
        traceback(seq1, seq2, v, u, S, current_x - step, current_y, S1, S2, seq1[current_x]+t1[:], "-"*step+t2[:], L, P, Q)
    if S[current_x,current_y] == Q[current_x,current_y]:
        step = np.add(S[current_x, 0:current_y], -(np.arange(current_y, 0, -1) * u + v))
        step = list(step)
        step = current_y - step.index(Q[current_x, current_y])
        print("third", step)
        traceback(seq1, seq2, v, u, S, current_x, current_y-step, S1, S2, "-"*step+t1[:], seq2[current_y]+t2[:], L, P, Q)


# 一定要写1/3,而不要写成0.33
S, P, Q, L = mark("CTATAATCCC", "CTGTATC", 1, -1/3, 1, 1/3)
# 对整个矩阵进行舍入运算
print("S = ",np.around(S,2))
print("P = ",np.around(P,2))
print("Q = ",np.around(Q,2))
print("L = ",np.around(L,2))

S1 = []
S2 = []
t1 = ""
t2 = ""
R = np.where(S == np.max(S))
# 这是同时遍历两个列表的方法
for x, y in zip(R[0],R[1]):
    traceback(" CTATAATCCC", " CTGTATC", 1, 1 / 3, S, x, y, S1, S2, t1, t2, L, P, Q)
print(S1)
print(S2)







猜你喜欢

转载自blog.csdn.net/jining11/article/details/81462723