依存句法本身是一个树结构：每一个词看成一个节点，依存关系就是一条有向边。本文主要基于清华大学的句法标注语料库进行训练和评估。
实现代码:
import sys  # kept for compatibility with the rest of the file

# NOTE(review): the original Py2-only hack (reload(sys);
# sys.setdefaultencoding('utf8')) was removed — reload() is not a builtin in
# Python 3 and default-encoding tweaks are unnecessary there; files are now
# opened with an explicit encoding instead.
import sklearn_crfsuite
from sklearn_crfsuite import metrics
# FIX: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; use the standalone joblib package (same load/dump API).
import joblib
class CorpusProcess(object):
    """Read the pre-processed dependency corpus and build CRF features.

    Corpus format (one token per line): ``word<TAB>pos<TAB>label``;
    sentences are separated by blank lines.
    """

    def __init__(self):
        """Record the locations of the pre-processed train/test corpora."""
        self.train_process_path = "D://input_py//day17//train.data"  # pre-processed training set
        self.test_process_path = "D://input_py//day17//dev.data"    # pre-processed test set

    def read_corpus_from_file(self, file_path):
        """Return all lines of the corpus file as a list of strings.

        FIX: use a context manager and explicit UTF-8 decoding — the corpus
        is Chinese text and the platform default codec may fail on it.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.readlines()

    def write_corpus_to_file(self, data, file_path):
        """Write ``str(data)`` to *file_path*; the file is always closed."""
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(str(data))

    def process_sentence(self, lines):
        """Yield sentences; each is a list of ``[word, pos, label]`` rows.

        A blank line terminates the current sentence. FIX: the original
        silently dropped the last sentence when the file did not end with a
        blank line — it is now flushed after the loop. Also removed the
        shadowing of the *lines* argument inside the loop body and the
        pointless list copy.
        """
        sentence = []
        for line in lines:
            if not line.strip():
                if sentence:  # skip duplicate/leading blank lines
                    yield sentence
                sentence = []
            else:
                sentence.append(line.strip().split(u'\t'))
        if sentence:  # flush trailing sentence (no final blank line)
            yield sentence

    def initialize(self):
        """Load both corpora and split them into per-sentence token tables."""
        train_lines = self.read_corpus_from_file(self.train_process_path)
        test_lines = self.read_corpus_from_file(self.test_process_path)
        self.train_sentences = list(self.process_sentence(train_lines))
        self.test_sentences = list(self.process_sentence(test_lines))

    def generator(self, train=True):
        """Return ``(features, tags)`` for the train (default) or test split."""
        sentences = self.train_sentences if train else self.test_sentences
        return self.extract_feature(sentences)

    def extract_feature(self, sentences):
        """Extract per-token CRF features for every sentence.

        Features: current word/POS, neighbouring word/POS ("BOS"/"EOS"/"un"
        at sentence boundaries), and their pairwise concatenations.
        Returns ``(features, tags)`` — parallel lists of per-sentence lists;
        the tag is the last column of each token row.
        """
        features, tags = [], []
        for sentence in sentences:
            feature_list, tag_list = [], []
            n = len(sentence)
            for i, token in enumerate(sentence):
                word, pos = token[0], token[1]
                prev_w = sentence[i - 1][0] if i > 0 else "BOS"
                prev_p = sentence[i - 1][1] if i > 0 else "un"
                next_w = sentence[i + 1][0] if i < n - 1 else "EOS"
                next_p = sentence[i + 1][1] if i < n - 1 else "un"
                feature = {
                    "w0": word,
                    "p0": pos,
                    "w-1": prev_w,
                    "w+1": next_w,
                    "p-1": prev_p,
                    "p+1": next_p,
                    "w-1:w0": prev_w + word,
                    "w0:w+1": word + next_w,
                    "p-1:p0": prev_p + pos,
                    "p0:p+1": pos + next_p,
                    "p-1:w0": prev_p + word,
                    "w0:p+1": word + next_p,
                }
                feature_list.append(feature)
                tag_list.append(token[-1])
            features.append(feature_list)
            tags.append(tag_list)
        return features, tags
class ModelParser(object):
    """Train, evaluate, persist, and apply a CRF dependency-label tagger."""

    def __init__(self):
        """Set CRF hyperparameters and pre-process the corpus."""
        self.algorithm = "lbfgs"
        self.c1 = 0.1           # L1 regularisation coefficient
        self.c2 = 0.1           # L2 regularisation coefficient
        self.max_iterations = 100
        self.model_path = "model.pkl"
        self.corpus = CorpusProcess()  # corpus helper
        self.corpus.initialize()       # loads train/test sentences from disk
        self.model = None

    def initialize_model(self):
        """Build the sklearn-crfsuite CRF from the configured hyperparameters."""
        self.model = sklearn_crfsuite.CRF(
            algorithm=self.algorithm,
            c1=float(self.c1),
            c2=float(self.c2),
            max_iterations=int(self.max_iterations),
            all_possible_transitions=True)

    def train(self):
        """Fit the CRF on the train split, report test metrics, save the model."""
        self.initialize_model()
        x_train, y_train = self.corpus.generator()
        self.model.fit(x_train, y_train)
        labels = list(self.model.classes_)
        x_test, y_test = self.corpus.generator(train=False)
        y_predict = self.model.predict(x_test)
        # FIX: the weighted F1 score was computed but discarded — report it.
        f1 = metrics.flat_f1_score(y_test, y_predict, average='weighted', labels=labels)
        print("weighted F1: {:.4f}".format(f1))
        # Sort labels by tag suffix so related labels group together in the report.
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        print(metrics.flat_classification_report(y_test, y_predict, labels=sorted_labels, digits=3))
        self.save_model()

    def predict(self, sentences):
        """Load the saved model and predict tag sequences for *sentences*."""
        self.load_model()
        features, _ = self.corpus.extract_feature(sentences)
        return self.model.predict(features)

    def load_model(self, name='model'):
        """Load the model from self.model_path (*name* kept for API compat; unused)."""
        self.model = joblib.load(self.model_path)

    def save_model(self, name='model'):
        """Persist the model to self.model_path (*name* kept for API compat; unused)."""
        joblib.dump(self.model, self.model_path)
if __name__ == "__main__":
    # FIX: guard the training run so importing this module has no side effects.
    model = ModelParser()
    model.train()
运行结果:
precision recall f1-score support
-1_a 0.811 0.796 0.804 221
-1_b 0.783 0.770 0.777 61
-1_c 0.000 0.000 0.000 5
-1_d 0.711 0.409 0.519 66
-1_f 0.867 0.565 0.684 23
-1_h 0.000 0.000 0.000 0
-1_k 0.667 1.000 0.800 2
-1_m 0.905 0.895 0.900 256
-1_n 0.720 0.754 0.737 967
11_n 0.000 0.000 0.000 0
-1_ng 0.000 0.000 0.000 23
-1_nl 0.750 0.143 0.240 21
-1_nr 1.000 0.059 0.111 17
-1_nr1 0.000 0.000 0.000 0
-1_nr2 0.000 0.000 0.000 0
-1_nrf 0.000 0.000 0.000 25
-1_nrj 0.000 0.000 0.000 2
-1_ns 0.870 0.400 0.548 50
-1_nsf 0.822 0.402 0.540 92
-1_nt 0.667 0.286 0.400 14
-1_nz 0.000 0.000 0.000 7
-1_o 0.000 0.000 0.000 1
-1_p 0.524 0.214 0.303 103
-1_q 0.706 0.649 0.676 37
-1_r 0.946 0.841 0.891 63
-1_s 0.737 0.933 0.824 15
-1_t 0.952 0.894 0.922 66
-1_u 0.000 0.000 0.000 3
-1_v 0.628 0.669 0.648 2396
-1_x 0.000 0.000 0.000 0
-1_z 0.875 0.636 0.737 11
-2_a 0.800 0.364 0.500 11
-2_b 0.000 0.000 0.000 1
-2_c 0.000 0.000 0.000 0
-2_d 0.000 0.000 0.000 2
-2_f 0.000 0.000 0.000 1
-2_m 0.897 0.876 0.886 89
-2_n 0.095 0.021 0.034 95
-2_ng 0.000 0.000 0.000 0
-2_nl 0.000 0.000 0.000 1
-2_nr 0.000 0.000 0.000 1
-2_nr2 0.000 0.000 0.000 0
-2_nrf 0.000 0.000 0.000 3
-2_ns 0.000 0.000 0.000 3
-2_nsf 0.000 0.000 0.000 3
-2_nz 0.000 0.000 0.000 1
-2_p 0.000 0.000 0.000 9
-2_q 0.000 0.000 0.000 2
-2_r 0.000 0.000 0.000 3
-2_s 0.000 0.000 0.000 0
-2_t 1.000 0.920 0.958 25
-2_u 0.000 0.000 0.000 0
-2_v 0.326 0.211 0.256 445
-2_z 0.000 0.000 0.000 0
-3_a 1.000 0.500 0.667 2
-3_b 0.000 0.000 0.000 0
-3_d 0.000 0.000 0.000 0
-3_m 0.625 0.750 0.682 20
-3_n 0.000 0.000 0.000 20
-3_nl 0.000 0.000 0.000 0
-3_nsf 0.000 0.000 0.000 0
-3_p 0.000 0.000 0.000 1
-3_q 0.000 0.000 0.000 0
-3_t 1.000 0.571 0.727 14
-3_v 0.125 0.045 0.066 112
-4_b 0.000 0.000 0.000 0
-4_d 0.000 0.000 0.000 0
-4_m 0.500 0.200 0.286 5
-4_n 0.000 0.000 0.000 8
-4_nsf 0.000 0.000 0.000 0
-4_p 0.000 0.000 0.000 1
-4_t 0.000 0.000 0.000 0
-4_v 0.000 0.000 0.000 33