今天遇见一个问题:要进行字符串匹配。
实际业务:比较N个JSON串和一行文本的相似度。并选择最像的M个JSON。
业务逻辑和这个一致:
我这里有一个包含100个数字的表格(表格1),
另外还有一个表格(表格2):请找出表格1和表格2中相同的数据,计算相似度并定位。
(丢几个数据不重要,重要的是相似度)
我XX日了狗了。
————————————————————————
写个脚本定位下:
def get_list2(data, line=2):
    """Split *data* into consecutive, non-overlapping chunks of *line* characters.

    Newline characters are removed before chunking. A trailing remainder
    shorter than *line* is dropped. If nothing is left once newlines are
    removed, the placeholder ``'¥¥'`` is chunked instead, so the result
    never contains an empty search key.

    Args:
        data: Text to split; non-string values are converted with ``str``.
        line: Chunk length in characters.

    Returns:
        A list of chunks, each exactly *line* characters long.
    """
    # Convert first: calling .replace() before str() made the conversion
    # pointless and crashed on non-string input.
    data = str(data).replace('\n', '')
    if data == '':
        data = '¥¥'
    return [data[i * line:(i + 1) * line] for i in range(len(data) // line)]
# l1.txt holds the candidate strings separated by "/>"; l2.txt holds the text
# they are compared against. Context managers close both files even if a
# read fails.
with open('l1.txt', 'r', encoding='gbk') as file1, \
        open('l2.txt', 'r', encoding='gbk', errors='replace') as file2:
    data1 = file1.read()
    data2 = file2.read()

count = []
key_list = []
list1 = data1.split("/>")
list2 = get_list2(data2)

# Score every candidate: one point per chunk of the text that occurs in it,
# remembering which chunks matched.
for part in list1:
    key_part = [key for key in list2 if key in part]
    count.append(len(key_part))
    key_list.append("".join(key_part))
print(len(count))
print(len(list1))

# Order the three parallel lists by descending score. sorted() is stable, so
# candidates with equal scores keep their original relative order.
order = sorted(range(len(count)), key=count.__getitem__, reverse=True)
count = [count[k] for k in order]
list1 = [list1[k] for k in order]
key_list = [key_list[k] for k in order]

# A second candidate only exists when the input contained at least one "/>".
has_second = len(list1) > 1

print(list1[0])
print('**************************************************************************')
if has_second:
    print(list1[1])
print('--------------------------------------------------------------------------')
print("最符合的信息串为:{}\n匹配的正文为{}\n匹配权重为{}".format(list1[0], key_list[0], count[0]))
if has_second:
    print('\n*******************\n')
    # Report the runner-up's own weight (count[1]), not the winner's.
    print("比较符合的信息串为:{}\n匹配的正文为{}\n匹配权重为{}".format(list1[1], key_list[1], count[1]))
第一:把JSON等保存为Dbunit。(让它们丢失格式,可以分开)
然后把字符串拼接到l1.txt。
然后需要匹配的字符串为l2.txt。
然后将要匹配的字符串拆解成若干个长度为2、3个字符的小字符串,再和各个列表项进行比较。
写完发现:虽然业务逻辑各种简陋,但是挺管用的。
凑合着用吧。
优化了一丢丢算法:
import time
import sys
# Data file whose content is split on "/>" into the candidate strings.
file1_root = r'F:\d1\getmessage\l1.txt'
# Data file whose content is the text the candidates are compared against.
file2_root = r'F:\d1\getmessage\l2.txt'
def get_char(end='end', end_char=''):
    """Read lines from standard input until a terminator and return them joined.

    Reading stops when a line equals ``str(end)``, when a line contains
    *end_char* (the part of that line before *end_char* is kept), or when
    standard input is exhausted.

    Args:
        end: Line that terminates the input; it is not included in the result.
        end_char: Optional in-line terminator; ignored when empty.

    Returns:
        The collected lines joined with ``'\\n'``, or ``''`` when no line
        was collected.
    """
    lines = []
    while True:
        try:
            var = input()
        except EOFError:
            # Treat end of input like the terminator line instead of crashing.
            break
        if var == str(end):
            break
        if end_char != '' and end_char in var:
            lines.append(var[:var.find(end_char)])
            break
        lines.append(var)
    # Joining a list avoids the old 'char_is_null' sentinel, which leaked
    # into the result for empty input and could delete matching user text.
    return '\n'.join(lines)
def get_list2(data, line=2):
    """Split *data* into consecutive, non-overlapping chunks of *line* characters.

    Newline characters are removed before chunking. A trailing remainder
    shorter than *line* is dropped. If nothing is left once newlines are
    removed, a run of 26 asterisks is chunked instead, so the result never
    contains an empty search key.

    Args:
        data: Text to split; non-string values are converted with ``str``.
        line: Chunk length in characters.

    Returns:
        A list of chunks, each exactly *line* characters long.
    """
    # Convert first: calling .replace() before str() made the conversion
    # pointless and crashed on non-string input.
    data = str(data).replace('\n', '')
    if data == '':
        data = '**************************'
    return [data[i * line:(i + 1) * line] for i in range(len(data) // line)]
# Make sure both data files exist before the first read: append mode creates
# a missing file without truncating an existing one.
with open(file1_root, 'a+', encoding='gbk', errors='replace'):
    pass
with open(file2_root, 'a+', encoding='gbk', errors='replace'):
    pass

while True:
    # Load and show the data stored by the previous run.
    with open(file1_root, 'r', encoding='gbk', errors='replace') as file1, \
            open(file2_root, 'r', encoding='gbk', errors='replace') as file2:
        data1 = file1.read()
        data2 = file2.read()
    print("获取的字符串为:{}".format(data1))
    print("比对的字符串为:{}".format(data2))

    get_data = input("需要获取文件?Y/N\n")
    if get_data in ('Y', 'y'):
        print("以/>分隔的字符串,以end结束\n")
        data1 = get_char()
        # Only display this prompt. Reading it with input() consumed the
        # first line of the comparison text and then threw it away.
        print("比对的字符串为,以end结束:\n", end='')
        data2 = get_char()
        # Persist the new data so a later run can reuse it.
        with open(file1_root, 'w', encoding='gbk', errors='replace') as file1:
            file1.write(data1)
        with open(file2_root, 'w', encoding='gbk', errors='replace') as file2:
            file2.write(data2)
    else:
        print('不获取新数据,处理历史数据\n')

    # Matching is case-insensitive.
    data1 = data1.lower()
    data2 = data2.lower()
    count = []
    key_list = []
    list1 = data1.split("/>")
    # Trailing empty candidate kept from the original script; it scores no
    # higher than any other candidate and is never reported.
    list1.append('')
    list2 = get_list2(data2)
    try:
        # Score every candidate: one point per chunk of the text that occurs
        # in it, remembering which chunks matched.
        for part in list1:
            key_part = [key for key in list2 if key in part]
            count.append(len(key_part))
            key_list.append("".join(key_part))
        print(len(count))
        print(len(list1))

        # Reorder all three parallel lists together by descending score
        # (the old sort left key_list behind). sorted() is stable, so ties
        # keep their original relative order.
        order = sorted(range(len(count)), key=count.__getitem__, reverse=True)
        count = [count[k] for k in order]
        list1 = [list1[k] for k in order]
        key_list = [key_list[k] for k in order]

        # Report the best candidate plus every following one that scores
        # more than half of the best, capped at 100 entries.
        for i in range(min(100, len(count))):
            if i > 0 and count[i] <= count[0] * 0.5:
                break
            print(list1[i])
            print('**************************************************************************')
            print('--------------------------------------------------------------------------')
            print("符合的信息串为:{}\n匹配的正文为{}\n匹配权重为{}".format(list1[i], key_list[i], count[i]))
    except Exception as e:
        # Keep the interactive session alive; show the problem and continue.
        print(e)

    var = input('点击重新操作?Y/N')
    if var not in ('Y', 'y'):
        break