Python 编辑距离

import numpy as np
import json
import codecs

# Compute the edit distance between two sequences
def edit_distance(word1, word2):
    """Return the Levenshtein edit distance between ``word1`` and ``word2``.

    Insertions, deletions and substitutions each cost 1. The distance is
    computed with the classic dynamic-programming table, where cell
    ``(r, c)`` holds the distance between the first ``r`` items of ``word1``
    and the first ``c`` items of ``word2``.

    Args:
        word1: First sequence (typically a string).
        word2: Second sequence (typically a string).

    Returns:
        The distance as a NumPy floating-point scalar, because the table is
        allocated with ``np.zeros`` and its default float dtype.
    """
    rows = len(word1) + 1
    cols = len(word2) + 1

    table = np.zeros((rows, cols))
    # Borders: turning a prefix into the empty sequence (or back) costs one
    # operation per item.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, item1 in enumerate(word1, start=1):
        for c, item2 in enumerate(word2, start=1):
            substitute = table[r - 1, c - 1] + (0 if item1 == item2 else 1)
            delete = table[r - 1, c] + 1
            insert = table[r, c - 1] + 1
            table[r, c] = min(substitute, delete, insert)

    return table[rows - 1, cols - 1]


# 190801
# Compute a similarity score from the edit distance
def simility(word1, word2):
    """Return a similarity score for ``word1`` and ``word2``.

    The score is ``1 - edit_distance / max(len(word1), len(word2))``: 1.0
    for identical inputs, decreasing as more edits are needed relative to
    the longer input.

    Args:
        word1: First sequence (typically a string).
        word2: Second sequence (typically a string).

    Returns:
        The similarity score. Two empty inputs are identical and score 1.0.
    """
    max_len = max(len(word1), len(word2))
    # Both inputs empty: the normalisation below would divide by zero, so
    # answer directly instead of producing NaN or an exception.
    if max_len == 0:
        return 1.0
    distance = edit_distance(word1, word2)
    return 1 - distance * 1.0 / max_len

# Greedy clustering of sub-sentences by edit-distance similarity.
#
# Input: one sub-sentence per line of the text file below.
# Output: a JSON list holding one cluster id per input line.

bianhaos = []
sub_sens = []
with codecs.open(r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write_sub.txt','r','utf8') as f:
    # Every line is kept exactly as read. An earlier variant split each line
    # on '<->' into an id and a sentence and stripped '<b>'/'<e>' markers;
    # that parsing is disabled, so ``bianhaos`` stays empty.
    sub_sens.extend(f)

count = len(sub_sens)
leibie = [-1] * count  # cluster id per line; -1 means "not assigned yet"
cla = 0                # id given to the next cluster that gets opened
print(count)

for i, sub1 in enumerate(sub_sens):
    if leibie[i] == -1:
        # Line i opens a new cluster and acts as its reference sentence.
        leibie[i] = cla
        for j, sub2 in enumerate(sub_sens):
            # Only still-unassigned lines are compared with the reference.
            if leibie[j] == -1 and simility(sub1, sub2) >= 0.5:
                leibie[j] = cla
        cla += 1
        print(i)

print(leibie)
with open('leibie05.json','w') as out:
    out.write(json.dumps(leibie))

猜你喜欢

转载自www.cnblogs.com/yiwoqu/p/11542074.html