处理的是一个比较琐碎的任务。
主要功能是按同义词把模板句子分裂(展开)成多个句子,下面是代码:
import itertools
import json
import math
import pickle
import re

import numpy as np
def data_load(path):
    """Load parallel (question, ext_question) lists from a JSON export.

    The file is expected to look like
    ``{"RECORDS": [{"question": ..., "ext_question": ...}, ...]}``.
    Records whose ``ext_question`` is a placeholder (``None``, ``"null"``,
    or ``"]"``) are dropped, keeping the two lists aligned.

    :param path: path to the UTF-8 JSON file
    :return: (questions, ext_questions) — two parallel lists
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    que = []
    ext_que = []
    for record in data["RECORDS"]:
        ext = record["ext_question"]
        # ``is not None`` instead of ``!= None``; keep the original
        # placeholder filter ("null" string and a stray "]") unchanged.
        if ext is not None and ext != "null" and ext != "]":
            que.append(record["question"])
            ext_que.append(ext)
    return que, ext_que
def get_length(sentence):
    """Return how many sentences *sentence* would expand into.

    Used as a cheap guard before calling :func:`q_extra` — some templates
    expand into an astronomical number of sentences and would blow up
    memory, so callers check this count first.

    Parsing rules (mirrors ``q_extra``):
      * ``{...}`` groups are alternations taken as a whole;
      * ``[...]`` brackets delimit segments;
      * a segment containing ``k`` ``|`` separators contributes ``k + 1``
        alternatives, and the total is the product over all segments.

    Uses exact Python-int ``math.prod`` rather than numpy int64 ``.prod()``,
    which could silently overflow for exactly the huge counts this guard
    exists to detect (and returned float ``1.0`` for empty input).

    :param sentence: template string such as ``"[a|b][c]"``
    :return: int — number of expanded sentences (1 for an empty template)
    """
    groups = re.split(r'[\}\{]', sentence)
    groups = [re.split(r'[\[\]]', g) for g in groups]
    groups = [[tok for tok in g if tok != ''] for g in groups]
    segments = []
    for g in groups:
        # a literal "|" token only appears between ``[...]`` inside ``{...}``:
        # the whole brace group is then one big alternation.
        if "|" in g:
            segments.append(''.join(g))
        else:
            segments.extend(g)
    # each segment with k '|' separators expands into k + 1 alternatives
    return math.prod(seg.count("|") + 1 for seg in segments)
def q_extra(sentence):
    """Expand a template sentence into every ``|``-alternative combination.

    Handles the two raw formats seen in the data:
      * ``[a|b][c]`` — each bracket is a segment with ``|`` alternatives;
      * ``{[a]|[b|c]}`` — a brace group whose bracketed parts are merged
        into one big alternation.

    The expansion is the Cartesian product of the per-segment choices
    (cf. LeetCode 17), produced here with :func:`itertools.product` in the
    same left-to-right order as the old accumulate-and-concatenate loop.
    Unlike the previous ``final[0]`` access, an empty template no longer
    raises ``IndexError`` — it yields ``['']``.

    :param sentence: template string such as ``"[a|b][c]"``
    :return: list[str] — all expanded sentences
    """
    groups = re.split(r'[\}\{]', sentence)
    groups = [re.split(r'[\[\]]', g) for g in groups]
    groups = [[tok for tok in g if tok != ''] for g in groups]
    segments = []
    for g in groups:
        # a literal "|" token marks a brace group: treat it as one alternation
        if "|" in g:
            segments.append(''.join(g))
        else:
            segments.extend(g)
    choices = [seg.split('|') for seg in segments]
    return [''.join(combo) for combo in itertools.product(*choices)]
if __name__ == '__main__':
    # Pipeline: load raw templates -> filter out templates that would
    # expand into too many sentences -> expand -> dump as delimited lines.
    path = "tbl_case_detail.json"
    question, ext_question = data_load(path)
    print(len(question), len(ext_question))

    que = []
    ext_que = []
    # Keep only templates with a sane expansion count — some would expand
    # into hundreds of millions of sentences and exhaust memory.
    for q, ext in zip(question, ext_question):
        n = get_length(ext)  # computed once per record (was called twice)
        if 0 < n < 50:
            que.append(q)
            ext_que.append(ext)
    print(len(que), len(ext_que))

    data = [q_extra(ext) for ext in ext_que]

    final_data = []
    for q, expansions in zip(que, data):
        for variant in expansions:
            # '¥' is the field delimiter expected downstream; the trailing
            # "1" is a constant positive label.
            final_data.append(q.strip() + '¥' + variant.strip() + "¥" + "1")
    print(len(final_data))

    # NOTE(review): output filename keeps the original spelling
    # ("data_flie", not "data_file") so downstream consumers still find it.
    with open("data_flie.utf8", "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in final_data)
代码写得还是不够精简完美,继续努力吧。
输入:[因为|原因][时钟|时候|时间][冲突][无法][完成|成功][mooc][课程][退出][学习][有什么?][影响]
[查询]{[视频]|[文件|文档]}[时间?][屏幕][查询][不全][具备|拥有|包含?][内容][什么是|是什么?][原因]
输出:分裂的句子