1、网络结构:对于问答匹配问题,很多人提出了深度学习的解决方案,可以尝试将问题及答案抽象成语义向量做内积
(如 Learning Semantic Textual Similarity from Conversations),
也可以做外积,生成“相似矩阵”后使用卷积神经网络加以相似度提取
(如 Convolutional Neural Network Architectures for Matching Natural Language Sentences 中李航的 ARC-II 结构,类似的用法在简单知识库的 QA 里也有,见 Question Answering over Freebase via Attentive RNN with Similarity Matrix based CNN)。
常用的衡量两段文本向量距离的结构方式,还有语义匹配中常用的拼接特征 $[u;\ v;\ |u - v|;\ u \odot v]$(其中 $u$、$v$ 为两段文本的语义向量,下文代码中的 score_func 即采用这一结构)
(Learning Semantic Textual Similarity from Conversations及其他文献多有提及)
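2、损失结构:在问答匹配的排序场景下,常用的损失是类似于 SVM hinge loss 的损失,
将得分逼迫收敛至固定区间的两端:
$$L = \max\bigl(0,\ 1 + s(q, a^-) - s(q, a^+)\bigr)$$
其中 $s(q, a)$ 为问题 $q$ 与答案 $a$ 的匹配得分,$a^+$ 为正确答案,$a^-$ 为错误答案(与下文代码中的 loss 定义一致)。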
这种排序打分的损失设定是具有一般性的,也是在搜索场景learning to rank的一般损失,见
Towards Better Text Understanding and Retrieval through Kernel Entity Salience Modeling
及其他一般的搜索排序模型。
有了上面在网络及损失上的总的结构规定后,一些细节是可以比较随意的设置的,本文就对于
insuranceqa-corpus-zh (中文保险QA数据集)
https://github.com/Samurais/insuranceqa-corpus-zh
构造简单的QA排序模型,并进行测试。当作对于短文本QA排序模型的一个“总结”,没有什么论文支撑,瞎写而已。
采取的网络结构:
LSTM 分别提取 QA 特征,将提取的隐状态放入全连接网络提取特征,之后进入
$[q;\ a;\ |q - a|;\ q \odot a]$ 拼接结构(见代码中的 score_func),最终得到得分后,对上述排序损失进行训练。
使用数据集:
https://github.com/Samurais/insuranceqa-corpus-zh
中已经进行分词,及编码的corpus数据集。
数据导出及训练网络结构:(使用train及valid)
import tensorflow as tf
from collections import Counter
import json
from functools import reduce
from collections import defaultdict
import numpy as np
from sklearn.utils import shuffle
import os

# Path pattern of the tokenized pair files; both slots take the split name.
_IQA_PATH_FORMAT = r"data/iqa.{}.json/iqa.{}.tokenlized.pair.json"
_DATA_SPLITS = ("train", "valid", "test")
# Checkpoint prefix shared by saving and restoring.
_CKPT_PATH = r"E:\Coding\python\InsurranceQA\sort_model.ckpt"


def str_count():
    """Scan every data split and collect corpus statistics.

    Returns:
        A tuple ``(item_size, max_quest_len, max_answer_len)``: the number of
        distinct tokens seen in questions and utterances, the longest
        question length and the longest utterance length over all splits.
    """
    token_counts = Counter()
    max_quest_len = 0
    max_answer_len = 0
    for split in _DATA_SPLITS:
        iqa_path = _IQA_PATH_FORMAT.format(split, split)
        with open(iqa_path, "r", encoding="utf-8") as f:
            json_obj = json.load(f)
        for ele in json_obj:
            question = ele["question"]
            utterance = ele["utterance"]
            token_counts.update(question)
            token_counts.update(utterance)
            max_quest_len = max(len(question), max_quest_len)
            max_answer_len = max(len(utterance), max_answer_len)
    return len(token_counts), max_quest_len, max_answer_len


# Computed once at import time because the defaults below depend on them.
vocab_size, max_quest_len, max_answer_len = str_count()


def _new_batch(batch_size, max_quest_len, max_answer_len, padding_idx):
    """Return three int32 matrices (questions, answers_1, answers_2) filled with padding."""
    def blank(width):
        return np.full(shape=[batch_size, width],
                       fill_value=padding_idx).astype(np.int32)

    return blank(max_quest_len), blank(max_answer_len), blank(max_answer_len)


def _fill_row(matrix, row, tokens):
    """Copy token ids into one row of ``matrix``, truncating to the row width."""
    clipped = tokens[:matrix.shape[1]]
    matrix[row, :len(clipped)] = clipped


def data_gen(type="train", max_quest_len=max_quest_len,
             max_answer_len=max_answer_len, batch_size=64,
             padding_idx=vocab_size):
    """Yield fixed-size training triples (question, wrong answer, right answer).

    Elements of the chosen split are grouped by ``qid``. For every question,
    each element labelled ``[0, 1]`` is paired with the first element
    labelled ``[1, 0]``. A batch is yielded (row-shuffled) each time
    ``batch_size`` rows are filled; a trailing partial batch is dropped.

    Args:
        type: Split name, one of "train", "valid", "test".
        max_quest_len: Padded width of the question matrix.
        max_answer_len: Padded width of both answer matrices.
        batch_size: Number of rows per yielded batch.
        padding_idx: Value used to pad short sequences.

    Yields:
        ``(questions, answers_1, answers_2)`` int32 arrays, where
        ``answers_1`` holds the ``[0, 1]``-labelled utterances and
        ``answers_2`` the ``[1, 0]``-labelled ones.

    Raises:
        ValueError: If ``type`` is not a known split name.
    """
    if type not in _DATA_SPLITS:
        raise ValueError("unknown data split: {!r}".format(type))

    iqa_path = _IQA_PATH_FORMAT.format(type, type)
    with open(iqa_path, "r", encoding="utf-8") as f:
        json_obj = json.load(f)

    key_json_obj = defaultdict(list)
    for ele in json_obj:
        key_json_obj[ele["qid"]].append(ele)

    start_idx = 0
    questions, answers_1, answers_2 = _new_batch(
        batch_size, max_quest_len, max_answer_len, padding_idx)

    for ele_list in key_json_obj.values():
        true_eles = [ele for ele in ele_list if ele["label"] == [1, 0]]
        if not true_eles:
            # Without a [1, 0]-labelled answer there is nothing to rank against.
            continue
        true_ele = true_eles[0]
        true_utterance = true_ele["utterance"]
        question = true_ele["question"]

        for ele in ele_list:
            if ele["label"] != [0, 1]:
                continue
            _fill_row(questions, start_idx, question)
            _fill_row(answers_1, start_idx, ele["utterance"])
            _fill_row(answers_2, start_idx, true_utterance)
            start_idx += 1

            if start_idx == batch_size:
                yield shuffle(questions, answers_1, answers_2)
                start_idx = 0
                questions, answers_1, answers_2 = _new_batch(
                    batch_size, max_quest_len, max_answer_len, padding_idx)


class Pair(object):
    """Pairwise QA ranking model: LSTM encoders plus a dense scoring head."""

    def __init__(self, word_embedding_size=10, vocab_size=vocab_size,
                 max_quest_len=max_quest_len, max_answer_len=max_answer_len,
                 hidden_size=10):
        """Create the placeholders and the embedding variable, then build the graph.

        Args:
            word_embedding_size: Width of each embedding row.
            vocab_size: Number of rows of the trainable embedding table.
            max_quest_len: Fixed width of the question placeholder.
            max_answer_len: Fixed width of both answer placeholders.
            hidden_size: LSTM unit count and width of the encoded vectors.
        """
        self.hidden_size = hidden_size

        with tf.name_scope("word_embedding"):
            self.Word_Embedding = tf.Variable(
                tf.random_normal(shape=[vocab_size, word_embedding_size])
            )

        self.quest_input = tf.placeholder(dtype=tf.int32, shape=[None, max_quest_len])
        self.answer_input_1 = tf.placeholder(dtype=tf.int32, shape=[None, max_answer_len])
        self.answer_input_2 = tf.placeholder(dtype=tf.int32, shape=[None, max_answer_len])

        self.keep_prob = tf.placeholder(dtype=tf.float32, shape=[])
        self.l2_param = tf.placeholder(dtype=tf.float32, shape=[])

        self.model_construct()

    def score_func(self, q_part, a_part):
        """Score a (question, answer) pair from [q, a, |q - a|, q * a].

        The dense layers use ``tf.AUTO_REUSE`` so both calls share weights.

        Returns:
            The output of the final one-unit dense layer.
        """
        abs_part = tf.abs(q_part - a_part)
        multiply_part = q_part * a_part
        fuse_part = tf.concat([q_part, a_part, abs_part, multiply_part],
                              axis=-1, name="fuse_part")
        hidden_layer = tf.layers.dense(inputs=fuse_part, units=100,
                                       name="hidden_layer", reuse=tf.AUTO_REUSE)
        score_layer = tf.layers.dense(inputs=hidden_layer, units=1,
                                      name="score_layer", reuse=tf.AUTO_REUSE)
        return score_layer

    def _flatten_and_project(self, outputs, name, reuse=None):
        """Flatten the per-step RNN outputs and project them to ``hidden_size``."""
        flat_dim = int(outputs.get_shape()[-1]) * int(outputs.get_shape()[-2])
        flat = tf.reshape(outputs, [-1, flat_dim])
        return tf.layers.dense(flat, units=self.hidden_size, name=name, reuse=reuse)

    def model_construct(self):
        """Build the encoders, both scores, the hinge loss and the train op."""
        # data_gen pads with index == vocab_size, one past the last table row.
        # tf.gather raises on CPU and returns zeros on GPU for such an index,
        # so append a constant zero row: padding maps to zeros on every
        # device while the variable's shape (and checkpoints) stay unchanged.
        # NOTE(review): this assumes real token ids are < vocab_size - confirm
        # against the corpus vocabulary.
        embedding_dim = int(self.Word_Embedding.get_shape()[-1])
        lookup_table = tf.concat(
            [self.Word_Embedding, tf.zeros([1, embedding_dim])],
            axis=0, name="padded_embedding")

        quest_input = tf.nn.embedding_lookup(lookup_table, self.quest_input,
                                             name="quest_input")
        answer_input_1 = tf.nn.embedding_lookup(lookup_table, self.answer_input_1,
                                                name="answer_input_1")
        answer_input_2 = tf.nn.embedding_lookup(lookup_table, self.answer_input_2,
                                                name="answer_input_2")

        quest_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hidden_size,
                                                  name="quest_cell")
        answer_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hidden_size,
                                                   name="answer_cell",
                                                   reuse=tf.AUTO_REUSE)
        quest_cell = tf.nn.rnn_cell.DropoutWrapper(
            quest_cell, input_keep_prob=self.keep_prob,
            output_keep_prob=self.keep_prob)
        answer_cell = tf.nn.rnn_cell.DropoutWrapper(
            answer_cell, input_keep_prob=self.keep_prob,
            output_keep_prob=self.keep_prob)

        q_outputs, _ = tf.nn.dynamic_rnn(cell=quest_cell, inputs=quest_input,
                                         dtype=tf.float32)
        a_outputs_1, _ = tf.nn.dynamic_rnn(cell=answer_cell, inputs=answer_input_1,
                                           dtype=tf.float32)
        a_outputs_2, _ = tf.nn.dynamic_rnn(cell=answer_cell, inputs=answer_input_2,
                                           dtype=tf.float32)

        q_part = self._flatten_and_project(q_outputs, name="q_part")
        a_part_1 = self._flatten_and_project(a_outputs_1, name="a_part")
        # Second answer branch shares the "a_part" projection weights.
        a_part_2 = self._flatten_and_project(a_outputs_2, name="a_part",
                                             reuse=tf.AUTO_REUSE)

        self.qa_score_1 = self.score_func(q_part, a_part_1)
        self.qa_score_2 = self.score_func(q_part, a_part_2)

        # Hinge ranking loss: answer_input_2 should outscore answer_input_1
        # by a margin of 1.
        self.loss = tf.reduce_mean(tf.nn.relu(1 + self.qa_score_1 - self.qa_score_2))

        self.l2_loss = None
        for train_able_var in tf.trainable_variables():
            if self.l2_loss is None:
                self.l2_loss = tf.nn.l2_loss(train_able_var)
            else:
                self.l2_loss += tf.nn.l2_loss(train_able_var)

        self.loss = self.loss + self.l2_param * self.l2_loss
        self.train_op = tf.train.AdamOptimizer(0.0001).minimize(self.loss)

    @staticmethod
    def train():
        """Train indefinitely; every 5 steps report validation and save a checkpoint.

        Restores from the checkpoint when its ``.index`` file exists,
        otherwise initializes all variables. The loop never returns.
        """
        train_gen = data_gen(type="train")
        valid_gen = data_gen(type="valid")
        pair = Pair()
        saver = tf.train.Saver()

        with tf.Session() as sess:
            if os.path.exists(_CKPT_PATH + ".index"):
                print("restore exists")
                saver.restore(sess, save_path=_CKPT_PATH)
            else:
                print("init global")
                sess.run(tf.global_variables_initializer())

            times = 0
            epochs = 0
            while True:
                # Only generator exhaustion starts a new epoch; any other
                # error (bad data, I/O) must surface instead of being retried.
                try:
                    questions, answers_1, answers_2 = next(train_gen)
                except StopIteration:
                    print("epoch {} end".format(epochs))
                    epochs += 1
                    train_gen = data_gen(type="train")
                    questions, answers_1, answers_2 = next(train_gen)

                _, loss = sess.run([pair.train_op, pair.loss], feed_dict={
                    pair.quest_input: questions,
                    pair.answer_input_1: answers_1,
                    pair.answer_input_2: answers_2,
                    pair.l2_param: 0.000001,
                    pair.keep_prob: 0.7
                })
                times += 1

                if times % 5 == 0:
                    print("train loss: {}".format(loss))
                    try:
                        questions, answers_1, answers_2 = next(valid_gen)
                    except StopIteration:
                        print("one valid epoch end")
                        valid_gen = data_gen(type="valid")
                        questions, answers_1, answers_2 = next(valid_gen)

                    # Dropout off and L2 weight zero for evaluation.
                    qa_score_1, qa_score_2, loss = sess.run(
                        [pair.qa_score_1, pair.qa_score_2, pair.loss],
                        feed_dict={
                            pair.quest_input: questions,
                            pair.answer_input_1: answers_1,
                            pair.answer_input_2: answers_2,
                            pair.l2_param: 0.0,
                            pair.keep_prob: 1.0
                        })
                    print(np.mean(qa_score_2 > qa_score_1).astype(np.float32))
                    print("valid loss: {}".format(loss))
                    saver.save(sess, save_path=_CKPT_PATH)


if __name__ == "__main__":
    Pair.train()
效果预测代码(使用test进行效果判定)
import tensorflow as tf
from collections import Counter
import json
from functools import reduce
from collections import defaultdict
import numpy as np
from sklearn.utils import shuffle

# Path pattern of the tokenized pair files; both slots take the split name.
_IQA_PATH_FORMAT = r"data/iqa.{}.json/iqa.{}.tokenlized.pair.json"
_DATA_SPLITS = ("train", "valid", "test")
# Checkpoint prefix shared by saving and restoring. Kept as a raw string:
# the backslashes would otherwise be read as (invalid) escape sequences.
_CKPT_PATH = r"E:\Coding\python\InsurranceQA\sort_model.ckpt"


def str_count():
    """Scan every data split and collect corpus statistics.

    Returns:
        A tuple ``(item_size, max_quest_len, max_answer_len)``: the number of
        distinct tokens seen in questions and utterances, the longest
        question length and the longest utterance length over all splits.
    """
    token_counts = Counter()
    max_quest_len = 0
    max_answer_len = 0
    for split in _DATA_SPLITS:
        iqa_path = _IQA_PATH_FORMAT.format(split, split)
        with open(iqa_path, "r", encoding="utf-8") as f:
            json_obj = json.load(f)
        for ele in json_obj:
            question = ele["question"]
            utterance = ele["utterance"]
            token_counts.update(question)
            token_counts.update(utterance)
            max_quest_len = max(len(question), max_quest_len)
            max_answer_len = max(len(utterance), max_answer_len)
    return len(token_counts), max_quest_len, max_answer_len


# Computed once at import time because the defaults below depend on them.
vocab_size, max_quest_len, max_answer_len = str_count()


def _fill_row(matrix, row, tokens):
    """Copy token ids into one row of ``matrix``, truncating to the row width."""
    clipped = tokens[:matrix.shape[1]]
    matrix[row, :len(clipped)] = clipped


def data_gen(type="train", max_quest_len=max_quest_len,
             max_answer_len=max_answer_len, padding_idx=vocab_size):
    """Yield one batch per question: every wrong answer against the right one.

    Elements of the chosen split are grouped by ``qid``. For each question,
    every element labelled ``[0, 1]`` becomes one row, paired with the first
    element labelled ``[1, 0]``. Questions lacking either kind are skipped.

    Args:
        type: Split name, one of "train", "valid", "test".
        max_quest_len: Padded width of the question matrix.
        max_answer_len: Padded width of both answer matrices.
        padding_idx: Value used to pad short sequences.

    Yields:
        ``(questions, answers_1, answers_2)`` int32 arrays with one row per
        ``[0, 1]``-labelled element; ``answers_1`` holds those utterances and
        ``answers_2`` repeats the ``[1, 0]``-labelled utterance.

    Raises:
        ValueError: If ``type`` is not a known split name.
    """
    if type not in _DATA_SPLITS:
        raise ValueError("unknown data split: {!r}".format(type))

    iqa_path = _IQA_PATH_FORMAT.format(type, type)
    with open(iqa_path, "r", encoding="utf-8") as f:
        json_obj = json.load(f)

    key_json_obj = defaultdict(list)
    for ele in json_obj:
        key_json_obj[ele["qid"]].append(ele)

    for ele_list in key_json_obj.values():
        true_eles = [ele for ele in ele_list if ele["label"] == [1, 0]]
        false_eles = [ele for ele in ele_list if ele["label"] == [0, 1]]
        if not true_eles or not false_eles:
            # Nothing to compare for this question.
            continue

        # Size the batch from the actual negatives rather than assuming the
        # group holds exactly one positive element.
        batch_size = len(false_eles)
        questions = np.full(shape=[batch_size, max_quest_len],
                            fill_value=padding_idx).astype(np.int32)
        answers_1 = np.full(shape=[batch_size, max_answer_len],
                            fill_value=padding_idx).astype(np.int32)
        answers_2 = np.full(shape=[batch_size, max_answer_len],
                            fill_value=padding_idx).astype(np.int32)

        true_ele = true_eles[0]
        true_utterance = true_ele["utterance"]
        question = true_ele["question"]

        for row, ele in enumerate(false_eles):
            _fill_row(questions, row, question)
            _fill_row(answers_1, row, ele["utterance"])
            _fill_row(answers_2, row, true_utterance)

        yield shuffle(questions, answers_1, answers_2)


class Pair(object):
    """Pairwise QA ranking model: LSTM encoders plus a dense scoring head."""

    def __init__(self, word_embedding_size=10, vocab_size=vocab_size,
                 max_quest_len=max_quest_len, max_answer_len=max_answer_len,
                 hidden_size=10):
        """Create the placeholders and the embedding variable, then build the graph.

        Args:
            word_embedding_size: Width of each embedding row.
            vocab_size: Number of rows of the trainable embedding table.
            max_quest_len: Fixed width of the question placeholder.
            max_answer_len: Fixed width of both answer placeholders.
            hidden_size: LSTM unit count and width of the encoded vectors.
        """
        self.hidden_size = hidden_size

        with tf.name_scope("word_embedding"):
            self.Word_Embedding = tf.Variable(
                tf.random_normal(shape=[vocab_size, word_embedding_size])
            )

        self.quest_input = tf.placeholder(dtype=tf.int32, shape=[None, max_quest_len])
        self.answer_input_1 = tf.placeholder(dtype=tf.int32, shape=[None, max_answer_len])
        self.answer_input_2 = tf.placeholder(dtype=tf.int32, shape=[None, max_answer_len])

        self.keep_prob = tf.placeholder(dtype=tf.float32, shape=[])
        self.l2_param = tf.placeholder(dtype=tf.float32, shape=[])

        self.model_construct()

    def score_func(self, q_part, a_part):
        """Score a (question, answer) pair from [q, a, |q - a|, q * a].

        The dense layers use ``tf.AUTO_REUSE`` so both calls share weights.

        Returns:
            The output of the final one-unit dense layer.
        """
        abs_part = tf.abs(q_part - a_part)
        multiply_part = q_part * a_part
        fuse_part = tf.concat([q_part, a_part, abs_part, multiply_part],
                              axis=-1, name="fuse_part")
        hidden_layer = tf.layers.dense(inputs=fuse_part, units=100,
                                       name="hidden_layer", reuse=tf.AUTO_REUSE)
        score_layer = tf.layers.dense(inputs=hidden_layer, units=1,
                                      name="score_layer", reuse=tf.AUTO_REUSE)
        return score_layer

    def _flatten_and_project(self, outputs, name, reuse=None):
        """Flatten the per-step RNN outputs and project them to ``hidden_size``."""
        flat_dim = int(outputs.get_shape()[-1]) * int(outputs.get_shape()[-2])
        flat = tf.reshape(outputs, [-1, flat_dim])
        return tf.layers.dense(flat, units=self.hidden_size, name=name, reuse=reuse)

    def model_construct(self):
        """Build the encoders, both scores, the hinge loss and the train op."""
        # data_gen pads with index == vocab_size, one past the last table row.
        # tf.gather raises on CPU and returns zeros on GPU for such an index,
        # so append a constant zero row: padding maps to zeros on every
        # device while the variable's shape (and checkpoints) stay unchanged.
        # NOTE(review): this assumes real token ids are < vocab_size - confirm
        # against the corpus vocabulary.
        embedding_dim = int(self.Word_Embedding.get_shape()[-1])
        lookup_table = tf.concat(
            [self.Word_Embedding, tf.zeros([1, embedding_dim])],
            axis=0, name="padded_embedding")

        quest_input = tf.nn.embedding_lookup(lookup_table, self.quest_input,
                                             name="quest_input")
        answer_input_1 = tf.nn.embedding_lookup(lookup_table, self.answer_input_1,
                                                name="answer_input_1")
        answer_input_2 = tf.nn.embedding_lookup(lookup_table, self.answer_input_2,
                                                name="answer_input_2")

        quest_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hidden_size,
                                                  name="quest_cell")
        answer_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hidden_size,
                                                   name="answer_cell",
                                                   reuse=tf.AUTO_REUSE)
        quest_cell = tf.nn.rnn_cell.DropoutWrapper(
            quest_cell, input_keep_prob=self.keep_prob,
            output_keep_prob=self.keep_prob)
        answer_cell = tf.nn.rnn_cell.DropoutWrapper(
            answer_cell, input_keep_prob=self.keep_prob,
            output_keep_prob=self.keep_prob)

        q_outputs, _ = tf.nn.dynamic_rnn(cell=quest_cell, inputs=quest_input,
                                         dtype=tf.float32)
        a_outputs_1, _ = tf.nn.dynamic_rnn(cell=answer_cell, inputs=answer_input_1,
                                           dtype=tf.float32)
        a_outputs_2, _ = tf.nn.dynamic_rnn(cell=answer_cell, inputs=answer_input_2,
                                           dtype=tf.float32)

        q_part = self._flatten_and_project(q_outputs, name="q_part")
        a_part_1 = self._flatten_and_project(a_outputs_1, name="a_part")
        # Second answer branch shares the "a_part" projection weights.
        a_part_2 = self._flatten_and_project(a_outputs_2, name="a_part",
                                             reuse=tf.AUTO_REUSE)

        self.qa_score_1 = self.score_func(q_part, a_part_1)
        self.qa_score_2 = self.score_func(q_part, a_part_2)

        # Hinge ranking loss: answer_input_2 should outscore answer_input_1
        # by a margin of 1.
        self.loss = tf.reduce_mean(tf.nn.relu(1 + self.qa_score_1 - self.qa_score_2))

        self.l2_loss = None
        for train_able_var in tf.trainable_variables():
            if self.l2_loss is None:
                self.l2_loss = tf.nn.l2_loss(train_able_var)
            else:
                self.l2_loss += tf.nn.l2_loss(train_able_var)

        self.loss = self.loss + self.l2_param * self.l2_loss
        self.train_op = tf.train.AdamOptimizer(0.0001).minimize(self.loss)

    @staticmethod
    def pred_test():
        """Evaluate the saved checkpoint with a single pass over the test split.

        For each question prints ``acc`` (share of wrong answers scored below
        the right one) and ``avg acc`` (running mean of ``acc`` so far).
        """
        pair = Pair()
        saver = tf.train.Saver()
        all_acc_list = []

        # NOTE(review): TensorFlow documents the device type key as "GPU";
        # the lowercase key is kept unchanged here - confirm whether CPU-only
        # execution was intended.
        with tf.Session(config=tf.ConfigProto(device_count={'gpu': 0})) as sess:
            saver.restore(sess, save_path=_CKPT_PATH)

            # Exactly one pass: restarting the generator would count the same
            # questions again in the running average and never terminate.
            for questions, answers_1, answers_2 in data_gen(type="test"):
                qa_score_1, qa_score_2 = sess.run(
                    [pair.qa_score_1, pair.qa_score_2],
                    feed_dict={
                        pair.quest_input: questions,
                        pair.answer_input_1: answers_1,
                        pair.answer_input_2: answers_2,
                        pair.l2_param: 0.0,
                        pair.keep_prob: 1.0
                    })
                acc = np.mean(qa_score_2 > qa_score_1).astype(np.float32)
                print("acc :{}".format(acc))
                all_acc_list.append(acc)
                print("avg acc :{}".format(np.mean(all_acc_list)))

    @staticmethod
    def train():
        """Train from scratch indefinitely; every 5 steps validate and save.

        The loop never returns.
        """
        train_gen = data_gen(type="train")
        valid_gen = data_gen(type="valid")
        pair = Pair()
        saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            times = 0
            epochs = 0
            while True:
                # Only generator exhaustion starts a new epoch; any other
                # error must surface instead of being silently retried.
                try:
                    questions, answers_1, answers_2 = next(train_gen)
                except StopIteration:
                    print("epoch {} end".format(epochs))
                    epochs += 1
                    train_gen = data_gen(type="train")
                    questions, answers_1, answers_2 = next(train_gen)

                _, loss = sess.run([pair.train_op, pair.loss], feed_dict={
                    pair.quest_input: questions,
                    pair.answer_input_1: answers_1,
                    pair.answer_input_2: answers_2,
                    pair.l2_param: 0.000001,
                    pair.keep_prob: 0.7
                })
                times += 1

                if times % 5 == 0:
                    print("train loss: {}".format(loss))
                    try:
                        questions, answers_1, answers_2 = next(valid_gen)
                    except StopIteration:
                        valid_gen = data_gen(type="valid")
                        questions, answers_1, answers_2 = next(valid_gen)

                    # Dropout off and L2 weight zero for evaluation.
                    qa_score_1, qa_score_2, loss = sess.run(
                        [pair.qa_score_1, pair.qa_score_2, pair.loss],
                        feed_dict={
                            pair.quest_input: questions,
                            pair.answer_input_1: answers_1,
                            pair.answer_input_2: answers_2,
                            pair.l2_param: 0.0,
                            pair.keep_prob: 1.0
                        })
                    print(np.mean(qa_score_2 > qa_score_1).astype(np.float32))
                    print("valid loss: {}".format(loss))
                    saver.save(sess, save_path=_CKPT_PATH)


if __name__ == "__main__":
    Pair.pred_test()
这里每一个acc给出的是:对同一个问题,正样本得分大于各个负样本得分的比率(即正样本在该问题的所有负样本中“胜出”的比例,可以看作是topK意义上的精度),avg acc 为累加到现阶段的平均topK精度。
下面是训练一个epoch后pred_test的输出例子:
acc :0.5 avg acc :0.5 acc :0.800000011920929 avg acc :0.6499999761581421 acc :0.8999999761581421 avg acc :0.7333332896232605 acc :1.0 avg acc :0.7999999523162842 acc :0.699999988079071 avg acc :0.7799999713897705 acc :1.0 avg acc :0.8166666030883789 acc :1.0 avg acc :0.8428570628166199 acc :0.6000000238418579 avg acc :0.8125 acc :1.0 avg acc :0.8333333134651184 acc :1.0 avg acc :0.8500000238418579 acc :0.800000011920929 avg acc :0.8454545736312866 acc :1.0 avg acc :0.8583333492279053 acc :1.0 avg acc :0.8692308068275452 acc :0.800000011920929 avg acc :0.8642857670783997 acc :0.8999999761581421 avg acc :0.8666666746139526 acc :0.8999999761581421 avg acc :0.8687499761581421
感兴趣还可以对比参看https://github.com/white127/insuranceQA-cnn-lstm中的top1精度。