QA 问答对排序模型小记

1、网络结构：对于问答匹配问题，很多人提出了深度学习的解决方案，可以尝试将问题及答案抽象成语义向量做内积

（如 Learning Semantic Textual Similarity from Conversations），

也可以做外积，生成“相似矩阵”后使用卷积神经网络加以相似度提取

（如 Convolutional Neural Network Architectures for Matching Natural Language Sentences 中李航的 ARC-II 结构，类似的用法在简单知识库的 QA 里也有，见 Question Answering over Freebase via Attentive RNN with Similarity Matrix based CNN）

常用的衡量两段文本向量距离的结构方式，还有语义匹配中常用的特征拼接 $[u;\ v;\ |u - v|;\ u \odot v]$，其中 $u$、$v$ 分别为两段文本的语义向量（对应下文代码 score_func 中的 fuse_part）

（Learning Semantic Textual Similarity from Conversations及其他文献多有提及）

2、损失结构：在问答匹配的排序场景下，常用的损失是类似于 SVM hinge loss 的损失，

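将得分逼迫收敛至固定区间的两端：

$$L = \max\bigl(0,\ 1 + s(q, a^-) - s(q, a^+)\bigr)$$

其中 $s(q, a)$ 为模型对问答对的打分，$a^+$、$a^-$ 分别为正确答案与错误答案，间隔取 1（对应下文代码中的 self.loss）。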

这种排序打分的损失设定是具有一般性的，也是在搜索场景learning to rank的一般损失，见

Towards Better Text Understanding and Retrieval through Kernel Entity Salience Modeling

及其他一般的搜索排序模型。

有了上面在网络及损失上的总的结构规定后，一些细节是可以比较随意的设置的，本文就对于

insuranceqa-corpus-zh (中文保险QA数据集)

https://github.com/Samurais/insuranceqa-corpus-zh

构造简单的QA排序模型，并进行测试。当作对于短文本QA排序模型的一个“总结”，没有什么论文支撑，瞎写而已。

采取的网络结构：

LSTM 分别提取 Q、A 的特征，将提取到的各时刻输出放入全连接网络提取特征（分别记为 $q$、$a$），之后进入

$[q;\ a;\ |q - a|;\ q \odot a]$

结构，最终得到得分后，对上述排序损失进行训练。

使用数据集：

https://github.com/Samurais/insuranceqa-corpus-zh

中已经进行分词，及编码的corpus数据集。

数据导出及训练网络结构：（使用train及valid）

import tensorflow as tf
from collections import Counter
import json
from functools import reduce
from collections import defaultdict
import numpy as np
from sklearn.utils import shuffle
import os

def str_count():
    """Collect corpus-wide token statistics from the train/valid/test pair files.

    Every record's ``question`` and ``utterance`` sequences are read from the
    three ``data/iqa.<split>.json/iqa.<split>.tokenlized.pair.json`` files.

    Returns:
        A tuple ``(item_size, max_quest_len, max_answer_len)``: the number of
        distinct tokens seen across all three files, the longest question
        length and the longest utterance length.
    """
    path_template = r"data/iqa.{}.json/iqa.{}.tokenlized.pair.json"
    token_counts = Counter()
    longest_question = 0
    longest_answer = 0

    for split in ("train", "valid", "test"):
        with open(path_template.format(split, split), "r", encoding="utf-8") as handle:
            records = json.load(handle)

        for record in records:
            question_tokens = record["question"]
            answer_tokens = record["utterance"]
            token_counts.update(question_tokens)
            token_counts.update(answer_tokens)
            longest_question = max(longest_question, len(question_tokens))
            longest_answer = max(longest_answer, len(answer_tokens))

    # Only the number of distinct tokens is needed, not their frequencies.
    return len(token_counts), longest_question, longest_answer

# Corpus statistics, computed once at import time by reading all three splits;
# they serve as default argument values for data_gen and Pair below.
vocab_size, max_quest_len, max_answer_len = str_count()

def data_gen(type = "train", max_quest_len = max_quest_len, max_answer_len = max_answer_len, batch_size = 64,
             padding_idx = vocab_size):
    """Yield fixed-size, shuffled batches of ranking triples for one data split.

    The pair file of the requested split is loaded and grouped by ``qid``.
    For every question, its first positive answer (label ``[1, 0]``) is paired
    with each of its negative answers (label ``[0, 1]``); each pair fills one
    row of the batch.

    Args:
        type: Split to read: "train", "valid" or "test".
        max_quest_len: Width of the question matrix; longer questions are
            truncated to this length.
        max_answer_len: Width of both answer matrices; longer answers are
            truncated to this length.
        batch_size: Number of rows in every yielded batch.
        padding_idx: Value that fills the unused positions of each row.

    Yields:
        ``(questions, answers_1, answers_2)`` int32 arrays of shapes
        ``[batch_size, max_quest_len]``, ``[batch_size, max_answer_len]`` and
        ``[batch_size, max_answer_len]``. ``answers_1`` holds the negative
        answer and ``answers_2`` the positive one. Rows are shuffled
        consistently across the three arrays. A trailing batch that is not
        completely filled is not yielded.
    """
    assert  type in ["train", "valid", "test"]

    iqa_path_format = r"data/iqa.{}.json/iqa.{}.tokenlized.pair.json"
    iqa_path = iqa_path_format.format(type, type)
    with open(iqa_path, "r", encoding="utf-8") as f:
        json_obj = json.load(f)

    def _empty_batch():
        # Fresh buffers for every batch so previously yielded arrays are never mutated.
        return (
            np.full([batch_size, max_quest_len], padding_idx, dtype=np.int32),
            np.full([batch_size, max_answer_len], padding_idx, dtype=np.int32),
            np.full([batch_size, max_answer_len], padding_idx, dtype=np.int32),
        )

    def _fill_row(matrix, row, tokens):
        # Truncate instead of indexing past the end of the row when a sequence is too long.
        tokens = tokens[:matrix.shape[1]]
        if len(tokens):
            matrix[row, :len(tokens)] = tokens

    key_json_obj = defaultdict(list)
    for ele in json_obj:
        key_json_obj[ele["qid"]].append(ele)

    questions, answers_1, answers_2 = _empty_batch()
    start_idx = 0
    for ele_list in key_json_obj.values():
        true_eles = [ele for ele in ele_list if ele["label"] == [1, 0]]
        if not true_eles:
            # Without a positive answer no ranking pair can be built. Skipping the
            # group avoids an IndexError that callers could mistake for the end of
            # the data.
            continue
        true_ele = true_eles[0]
        true_utterance = true_ele["utterance"]
        question = true_ele["question"]

        for ele in ele_list:
            if ele["label"] != [0, 1]:
                continue
            _fill_row(questions, start_idx, question)
            _fill_row(answers_1, start_idx, ele["utterance"])
            _fill_row(answers_2, start_idx, true_utterance)
            start_idx += 1

            if start_idx == batch_size:
                questions, answers_1, answers_2 = shuffle(questions, answers_1, answers_2)
                yield (questions, answers_1, answers_2)
                start_idx = 0
                questions, answers_1, answers_2 = _empty_batch()

class Pair(object):
    """Pairwise LSTM ranking model for question/answer matching (TF1 graph mode).

    Constructing an instance creates the placeholders and the whole graph:
    a shared word embedding, one LSTM cell for questions and one for both
    answers, a dense projection of the flattened LSTM outputs, two stacked
    dense layers scoring ``[q, a, |q - a|, q * a]``, and a margin ranking loss
    with an L2 penalty.
    """

    def __init__(self, word_embedding_size = 10, vocab_size = vocab_size, max_quest_len = max_quest_len, max_answer_len = max_answer_len,
                 hidden_size = 10):
        """Create the embedding table and placeholders, then build the graph.

        Args:
            word_embedding_size: Width of each word embedding vector.
            vocab_size: Number of rows in the embedding table.
            max_quest_len: Second dimension of the question placeholder.
            max_answer_len: Second dimension of both answer placeholders.
            hidden_size: LSTM unit count and width of the projected features.
        """
        self.hidden_size = hidden_size
        with tf.name_scope("word_embedding"):
            # NOTE(review): data_gen pads with padding_idx == vocab_size by default,
            # which is one past the last row of this [vocab_size, dim] table.
            # Confirm how embedding_lookup treats that index on the target device.
            # The shape is kept as-is so existing checkpoints stay loadable.
            self.Word_Embedding = tf.Variable(
                tf.random_normal(shape=[vocab_size, word_embedding_size])
            )

        self.quest_input = tf.placeholder(dtype=tf.int32, shape=[None, max_quest_len])
        # In train(), answer_input_1 is fed the negative answers and
        # answer_input_2 the positive answers produced by data_gen.
        self.answer_input_1 = tf.placeholder(dtype=tf.int32, shape=[None, max_answer_len])
        self.answer_input_2 = tf.placeholder(dtype=tf.int32, shape=[None, max_answer_len])

        self.keep_prob = tf.placeholder(dtype=tf.float32, shape=[])
        self.l2_param = tf.placeholder(dtype=tf.float32, shape=[])

        self.model_construct()

    def score_func(self, q_part, a_part):
        """Score a batch of (question, answer) feature pairs.

        The two feature tensors are combined as ``[q, a, |q - a|, q * a]`` and
        passed through two dense layers. Both layers use ``tf.AUTO_REUSE``, so
        repeated calls share the same weights.

        Args:
            q_part: Question features.
            a_part: Answer features, same shape as ``q_part``.

        Returns:
            The output of the final dense layer with ``units=1``.
        """
        abs_part = tf.abs(q_part - a_part)
        multiply_part = q_part * a_part
        fuse_part = tf.concat([q_part, a_part, abs_part, multiply_part], axis=-1, name="fuse_part")

        # No activation is passed, so both layers are linear.
        hidden_layer = tf.layers.dense(inputs=fuse_part, units=100, name="hidden_layer", reuse=tf.AUTO_REUSE)
        score_layer = tf.layers.dense(inputs=hidden_layer, units=1, name="score_layer", reuse=tf.AUTO_REUSE)
        return score_layer

    def _flatten_and_project(self, rnn_outputs, name, reuse=None):
        """Flatten the last two dimensions of ``rnn_outputs`` and project to ``hidden_size``."""
        flat_dim = int(rnn_outputs.get_shape()[-1]) * int(rnn_outputs.get_shape()[-2])
        flat = tf.reshape(rnn_outputs, [-1, flat_dim])
        return tf.layers.dense(flat, units=self.hidden_size, name=name, reuse=reuse)

    def model_construct(self):
        """Build the encoders, both scores, the loss and the training op.

        Sets ``qa_score_1``, ``qa_score_2``, ``l2_loss``, ``loss`` and
        ``train_op`` on the instance.
        """
        quest_input = tf.nn.embedding_lookup(self.Word_Embedding, self.quest_input, name="quest_input")
        answer_input_1 = tf.nn.embedding_lookup(self.Word_Embedding, self.answer_input_1, name="answer_input_1")
        answer_input_2 = tf.nn.embedding_lookup(self.Word_Embedding, self.answer_input_2, name="answer_input_2")

        quest_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hidden_size, name="quest_cell")
        # The answer cell is applied twice (negative and positive answer), hence AUTO_REUSE.
        answer_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hidden_size, name="answer_cell", reuse=tf.AUTO_REUSE)

        quest_cell = tf.nn.rnn_cell.DropoutWrapper(quest_cell, input_keep_prob=self.keep_prob,
                                                   output_keep_prob=self.keep_prob)
        answer_cell = tf.nn.rnn_cell.DropoutWrapper(answer_cell, input_keep_prob=self.keep_prob,
                                                    output_keep_prob=self.keep_prob)

        # Only the per-step outputs are used; the final states are discarded.
        q_outputs, _ = tf.nn.dynamic_rnn(cell = quest_cell, inputs=quest_input, dtype=tf.float32)
        a_outputs_1, _ = tf.nn.dynamic_rnn(cell = answer_cell, inputs=answer_input_1, dtype=tf.float32)
        a_outputs_2, _ = tf.nn.dynamic_rnn(cell = answer_cell, inputs=answer_input_2, dtype=tf.float32)

        # Layer names and reuse flags are kept exactly as before so variable
        # names (and therefore checkpoints) do not change.
        q_part = self._flatten_and_project(q_outputs, name="q_part")
        a_part_1 = self._flatten_and_project(a_outputs_1, name="a_part")
        a_part_2 = self._flatten_and_project(a_outputs_2, name="a_part", reuse=tf.AUTO_REUSE)

        self.qa_score_1 = self.score_func(q_part, a_part_1)
        self.qa_score_2 = self.score_func(q_part, a_part_2)

        # Hinge ranking loss: penalize unless score_2 exceeds score_1 by at least 1.
        self.loss = tf.reduce_mean(tf.nn.relu(1 + self.qa_score_1 - self.qa_score_2))

        # Sum of L2 penalties over every trainable variable created so far.
        self.l2_loss = tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()])
        self.loss = self.loss + self.l2_param * self.l2_loss

        self.train_op = tf.train.AdamOptimizer(0.0001).minimize(self.loss)

    @staticmethod
    def train(ckpt_path = r"E:\Coding\python\InsurranceQA\sort_model.ckpt"):
        """Train indefinitely, validating and checkpointing every 5 steps.

        If ``ckpt_path + ".index"`` exists the variables are restored from
        ``ckpt_path``; otherwise they are freshly initialized. The loop has no
        stopping condition and must be interrupted by the user.

        Args:
            ckpt_path: Checkpoint prefix used for both restoring and saving.
        """
        train_gen = data_gen(type="train")
        valid_gen = data_gen(type="valid")

        pair = Pair()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            if os.path.exists(ckpt_path + ".index"):
                print("restore exists")
                saver.restore(sess, save_path=ckpt_path)
            else:
                print("init global")
                sess.run(tf.global_variables_initializer())

            times = 0
            epochs = 0
            while True:
                # Only generator exhaustion marks the end of an epoch; any other
                # error (bad data, Ctrl+C) must propagate instead of being hidden.
                try:
                    questions, answers_1, answers_2 = next(train_gen)
                except StopIteration:
                    print("epoch {} end".format(epochs))
                    epochs += 1
                    train_gen = data_gen(type="train")
                    questions, answers_1, answers_2 = next(train_gen)

                _, loss = sess.run([pair.train_op ,pair.loss],
                                   feed_dict={
                                       pair.quest_input: questions,
                                       pair.answer_input_1: answers_1,
                                       pair.answer_input_2: answers_2,
                                       pair.l2_param: 0.000001,
                                       pair.keep_prob: 0.7
                                   })

                times += 1
                if times % 5 == 0:
                    print("train loss: {}".format(loss))
                    try:
                        questions, answers_1, answers_2 = next(valid_gen)
                    except StopIteration:
                        print("one valid epoch end")
                        valid_gen = data_gen(type="valid")
                        questions, answers_1, answers_2 = next(valid_gen)

                    # Validation runs without dropout and without the L2 term.
                    qa_score_1, qa_score_2, loss = sess.run([pair.qa_score_1, pair.qa_score_2 ,pair.loss],
                                                            feed_dict={
                                                                pair.quest_input: questions,
                                                                pair.answer_input_1: answers_1,
                                                                pair.answer_input_2: answers_2,
                                                                pair.l2_param: 0.0,
                                                                pair.keep_prob: 1.0
                                                            })
                    # Fraction of pairs in which answer_input_2 outscores answer_input_1.
                    print(np.mean(qa_score_2 > qa_score_1).astype(np.float32))

                    print("valid loss: {}".format(loss))
                    saver.save(sess, save_path=ckpt_path)

# Script entry point: start (or resume from an existing checkpoint) training.
if __name__ == "__main__":
    Pair.train()

效果预测代码（使用test进行效果判定）

import tensorflow as tf
from collections import Counter
import json
from functools import reduce
from collections import defaultdict
import numpy as np
from sklearn.utils import shuffle

def str_count():
    """Collect corpus-wide token statistics from the train/valid/test pair files.

    Every record's ``question`` and ``utterance`` sequences are read from the
    three ``data/iqa.<split>.json/iqa.<split>.tokenlized.pair.json`` files.

    Returns:
        A tuple ``(item_size, max_quest_len, max_answer_len)``: the number of
        distinct tokens seen across all three files, the longest question
        length and the longest utterance length.
    """
    path_template = r"data/iqa.{}.json/iqa.{}.tokenlized.pair.json"
    seen_tokens = Counter()
    quest_len_max = 0
    answer_len_max = 0

    for split_name in ("train", "valid", "test"):
        split_path = path_template.format(split_name, split_name)
        with open(split_path, "r", encoding="utf-8") as fh:
            rows = json.load(fh)

        for row in rows:
            quest_tokens = row["question"]
            utter_tokens = row["utterance"]
            seen_tokens.update(quest_tokens)
            seen_tokens.update(utter_tokens)
            quest_len_max = max(quest_len_max, len(quest_tokens))
            answer_len_max = max(answer_len_max, len(utter_tokens))

    # Only the number of distinct tokens is needed, not their frequencies.
    return len(seen_tokens), quest_len_max, answer_len_max

# Corpus statistics, computed once at import time by reading all three splits;
# they serve as default argument values for data_gen and Pair below.
vocab_size, max_quest_len, max_answer_len = str_count()

def data_gen(type = "train", max_quest_len = max_quest_len, max_answer_len = max_answer_len,
             padding_idx = vocab_size):
    """Yield one shuffled batch per question, covering all of its negative answers.

    The pair file of the requested split is loaded and grouped by ``qid``.
    For every question, its first positive answer (label ``[1, 0]``) is paired
    with each of its negative answers (label ``[0, 1]``); the batch has one
    row per negative answer.

    Args:
        type: Split to read: "train", "valid" or "test".
        max_quest_len: Width of the question matrix; longer questions are
            truncated to this length.
        max_answer_len: Width of both answer matrices; longer answers are
            truncated to this length.
        padding_idx: Value that fills the unused positions of each row.

    Yields:
        ``(questions, answers_1, answers_2)`` int32 arrays with one row per
        negative answer of the question. ``answers_1`` holds the negative
        answer and ``answers_2`` the positive one. Rows are shuffled
        consistently across the three arrays. Questions lacking a positive or
        any negative answer are skipped.
    """
    assert  type in ["train", "valid", "test"]

    iqa_path_format = r"data/iqa.{}.json/iqa.{}.tokenlized.pair.json"
    iqa_path = iqa_path_format.format(type, type)
    with open(iqa_path, "r", encoding="utf-8") as f:
        json_obj = json.load(f)

    def _fill_row(matrix, row, tokens):
        # Truncate instead of indexing past the end of the row when a sequence is too long.
        tokens = tokens[:matrix.shape[1]]
        if len(tokens):
            matrix[row, :len(tokens)] = tokens

    key_json_obj = defaultdict(list)
    for ele in json_obj:
        key_json_obj[ele["qid"]].append(ele)

    for ele_list in key_json_obj.values():
        true_eles = [ele for ele in ele_list if ele["label"] == [1, 0]]
        false_eles = [ele for ele in ele_list if ele["label"] == [0, 1]]
        if not true_eles or not false_eles:
            # No ranking pair can be formed for this question.
            continue

        # Size the batch by the actual number of negatives. Deriving it from
        # len(ele_list) - 1 silently dropped questions with several positives.
        batch_size = len(false_eles)
        questions = np.full([batch_size, max_quest_len], padding_idx, dtype=np.int32)
        answers_1 = np.full([batch_size, max_answer_len], padding_idx, dtype=np.int32)
        answers_2 = np.full([batch_size, max_answer_len], padding_idx, dtype=np.int32)

        true_ele = true_eles[0]
        true_utterance = true_ele["utterance"]
        question = true_ele["question"]
        for row, ele in enumerate(false_eles):
            _fill_row(questions, row, question)
            _fill_row(answers_1, row, ele["utterance"])
            _fill_row(answers_2, row, true_utterance)

        questions, answers_1, answers_2 = shuffle(questions, answers_1, answers_2)
        yield (questions, answers_1, answers_2)

class Pair(object):
    """Pairwise LSTM ranking model for question/answer matching (TF1 graph mode).

    Constructing an instance creates the placeholders and the whole graph:
    a shared word embedding, one LSTM cell for questions and one for both
    answers, a dense projection of the flattened LSTM outputs, two stacked
    dense layers scoring ``[q, a, |q - a|, q * a]``, and a margin ranking loss
    with an L2 penalty.
    """

    def __init__(self, word_embedding_size = 10, vocab_size = vocab_size, max_quest_len = max_quest_len, max_answer_len = max_answer_len,
                 hidden_size = 10):
        """Create the embedding table and placeholders, then build the graph.

        Args:
            word_embedding_size: Width of each word embedding vector.
            vocab_size: Number of rows in the embedding table.
            max_quest_len: Second dimension of the question placeholder.
            max_answer_len: Second dimension of both answer placeholders.
            hidden_size: LSTM unit count and width of the projected features.
        """
        self.hidden_size = hidden_size
        with tf.name_scope("word_embedding"):
            # NOTE(review): data_gen pads with padding_idx == vocab_size by default,
            # which is one past the last row of this [vocab_size, dim] table.
            # Confirm how embedding_lookup treats that index on the target device.
            # The shape is kept as-is so existing checkpoints stay loadable.
            self.Word_Embedding = tf.Variable(
                tf.random_normal(shape=[vocab_size, word_embedding_size])
            )

        self.quest_input = tf.placeholder(dtype=tf.int32, shape=[None, max_quest_len])
        # The callers below feed answer_input_1 with the negative answers and
        # answer_input_2 with the positive answers produced by data_gen.
        self.answer_input_1 = tf.placeholder(dtype=tf.int32, shape=[None, max_answer_len])
        self.answer_input_2 = tf.placeholder(dtype=tf.int32, shape=[None, max_answer_len])

        self.keep_prob = tf.placeholder(dtype=tf.float32, shape=[])
        self.l2_param = tf.placeholder(dtype=tf.float32, shape=[])

        self.model_construct()

    def score_func(self, q_part, a_part):
        """Score a batch of (question, answer) feature pairs.

        The two feature tensors are combined as ``[q, a, |q - a|, q * a]`` and
        passed through two dense layers. Both layers use ``tf.AUTO_REUSE``, so
        repeated calls share the same weights.

        Args:
            q_part: Question features.
            a_part: Answer features, same shape as ``q_part``.

        Returns:
            The output of the final dense layer with ``units=1``.
        """
        abs_part = tf.abs(q_part - a_part)
        multiply_part = q_part * a_part
        fuse_part = tf.concat([q_part, a_part, abs_part, multiply_part], axis=-1, name="fuse_part")

        # No activation is passed, so both layers are linear.
        hidden_layer = tf.layers.dense(inputs=fuse_part, units=100, name="hidden_layer", reuse=tf.AUTO_REUSE)
        score_layer = tf.layers.dense(inputs=hidden_layer, units=1, name="score_layer", reuse=tf.AUTO_REUSE)
        return score_layer

    def _flatten_and_project(self, rnn_outputs, name, reuse=None):
        """Flatten the last two dimensions of ``rnn_outputs`` and project to ``hidden_size``."""
        flat_dim = int(rnn_outputs.get_shape()[-1]) * int(rnn_outputs.get_shape()[-2])
        flat = tf.reshape(rnn_outputs, [-1, flat_dim])
        return tf.layers.dense(flat, units=self.hidden_size, name=name, reuse=reuse)

    def model_construct(self):
        """Build the encoders, both scores, the loss and the training op.

        Sets ``qa_score_1``, ``qa_score_2``, ``l2_loss``, ``loss`` and
        ``train_op`` on the instance.
        """
        quest_input = tf.nn.embedding_lookup(self.Word_Embedding, self.quest_input, name="quest_input")
        answer_input_1 = tf.nn.embedding_lookup(self.Word_Embedding, self.answer_input_1, name="answer_input_1")
        answer_input_2 = tf.nn.embedding_lookup(self.Word_Embedding, self.answer_input_2, name="answer_input_2")

        quest_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hidden_size, name="quest_cell")
        # The answer cell is applied twice (negative and positive answer), hence AUTO_REUSE.
        answer_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hidden_size, name="answer_cell", reuse=tf.AUTO_REUSE)

        quest_cell = tf.nn.rnn_cell.DropoutWrapper(quest_cell, input_keep_prob=self.keep_prob,
                                                   output_keep_prob=self.keep_prob)
        answer_cell = tf.nn.rnn_cell.DropoutWrapper(answer_cell, input_keep_prob=self.keep_prob,
                                                    output_keep_prob=self.keep_prob)

        # Only the per-step outputs are used; the final states are discarded.
        q_outputs, _ = tf.nn.dynamic_rnn(cell = quest_cell, inputs=quest_input, dtype=tf.float32)
        a_outputs_1, _ = tf.nn.dynamic_rnn(cell = answer_cell, inputs=answer_input_1, dtype=tf.float32)
        a_outputs_2, _ = tf.nn.dynamic_rnn(cell = answer_cell, inputs=answer_input_2, dtype=tf.float32)

        # Layer names and reuse flags are kept exactly as before so variable
        # names (and therefore checkpoints) do not change.
        q_part = self._flatten_and_project(q_outputs, name="q_part")
        a_part_1 = self._flatten_and_project(a_outputs_1, name="a_part")
        a_part_2 = self._flatten_and_project(a_outputs_2, name="a_part", reuse=tf.AUTO_REUSE)

        self.qa_score_1 = self.score_func(q_part, a_part_1)
        self.qa_score_2 = self.score_func(q_part, a_part_2)

        # Hinge ranking loss: penalize unless score_2 exceeds score_1 by at least 1.
        self.loss = tf.reduce_mean(tf.nn.relu(1 + self.qa_score_1 - self.qa_score_2))

        # Sum of L2 penalties over every trainable variable created so far.
        self.l2_loss = tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()])
        self.loss = self.loss + self.l2_param * self.l2_loss

        self.train_op = tf.train.AdamOptimizer(0.0001).minimize(self.loss)

    @staticmethod
    def pred_test(ckpt_path = r"E:\Coding\python\InsurranceQA\sort_model.ckpt"):
        """Evaluate a saved checkpoint with a single pass over the test split.

        For every batch yielded by ``data_gen(type="test")`` (one question per
        batch) this prints the fraction of rows where the positive answer
        outscores the negative one, followed by the running mean of those
        fractions.

        Args:
            ckpt_path: Checkpoint prefix to restore the variables from.

        Returns:
            The mean per-question accuracy as a float, or ``None`` if the
            generator yielded no batch.
        """
        pair = Pair()
        saver = tf.train.Saver()
        all_acc_list = []
        # NOTE(review): TensorFlow documents the device_count keys as "CPU"/"GPU";
        # confirm whether this lowercase key has any effect.
        with tf.Session(config=tf.ConfigProto(device_count={'gpu':0})) as sess:
            saver.restore(sess, save_path=ckpt_path)
            # One pass only: restarting the generator on exhaustion would never
            # terminate and never produce a final result.
            for questions, answers_1, answers_2 in data_gen(type="test"):
                qa_score_1, qa_score_2 = sess.run([pair.qa_score_1, pair.qa_score_2], feed_dict={
                    pair.quest_input: questions,
                    pair.answer_input_1: answers_1,
                    pair.answer_input_2: answers_2,
                    pair.l2_param: 0.0,
                    pair.keep_prob: 1.0
                })

                acc = np.mean(qa_score_2 > qa_score_1).astype(np.float32)
                print("acc :{}".format(acc))
                all_acc_list.append(acc)
                print("avg acc :{}".format(np.mean(all_acc_list)))

        if not all_acc_list:
            return None
        return float(np.mean(all_acc_list))

    @staticmethod
    def train(ckpt_path = r"E:\Coding\python\InsurranceQA\sort_model.ckpt"):
        """Train from scratch indefinitely, validating and checkpointing every 5 steps.

        Variables are always freshly initialized. The loop has no stopping
        condition and must be interrupted by the user.

        Args:
            ckpt_path: Checkpoint prefix the model is saved to.
        """
        train_gen = data_gen(type="train")
        valid_gen = data_gen(type="valid")

        pair = Pair()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            times = 0
            epochs = 0

            while True:
                # Only generator exhaustion marks the end of an epoch; any other
                # error (bad data, Ctrl+C) must propagate instead of being hidden.
                try:
                    questions, answers_1, answers_2 = next(train_gen)
                except StopIteration:
                    print("epoch {} end".format(epochs))
                    epochs += 1
                    train_gen = data_gen(type="train")
                    questions, answers_1, answers_2 = next(train_gen)

                _, loss = sess.run([pair.train_op ,pair.loss],
                                   feed_dict={
                                       pair.quest_input: questions,
                                       pair.answer_input_1: answers_1,
                                       pair.answer_input_2: answers_2,
                                       pair.l2_param: 0.000001,
                                       pair.keep_prob: 0.7
                                   })

                times += 1
                if times % 5 == 0:
                    print("train loss: {}".format(loss))
                    try:
                        questions, answers_1, answers_2 = next(valid_gen)
                    except StopIteration:
                        valid_gen = data_gen(type="valid")
                        questions, answers_1, answers_2 = next(valid_gen)

                    # Validation runs without dropout and without the L2 term.
                    qa_score_1, qa_score_2, loss = sess.run([pair.qa_score_1, pair.qa_score_2 ,pair.loss],
                                                            feed_dict={
                                                                pair.quest_input: questions,
                                                                pair.answer_input_1: answers_1,
                                                                pair.answer_input_2: answers_2,
                                                                pair.l2_param: 0.0,
                                                                pair.keep_prob: 1.0
                                                            })
                    # Fraction of pairs in which answer_input_2 outscores answer_input_1.
                    print(np.mean(qa_score_2 > qa_score_1).astype(np.float32))

                    print("valid loss: {}".format(loss))
                    # The default path is a raw string, so its backslashes are
                    # never parsed as escape sequences.
                    saver.save(sess, save_path=ckpt_path)

# Script entry point: evaluate the saved checkpoint on the test split.
if __name__ == "__main__":
    Pair.pred_test()

这里每一个 acc 对应测试集中的一个问题，给出的是该问题的正样本得分高于其各个负样本得分的比例（即被正样本“击败”的负样本占全部负样本的比率，可以看作是 topK 意义上的精度），avg acc 为累加到现阶段的平均 topK 精度。

下面是训练一个epoch后pred_test的输出例子：

acc :0.5
avg acc :0.5
acc :0.800000011920929
avg acc :0.6499999761581421
acc :0.8999999761581421
avg acc :0.7333332896232605
acc :1.0
avg acc :0.7999999523162842
acc :0.699999988079071
avg acc :0.7799999713897705
acc :1.0
avg acc :0.8166666030883789
acc :1.0
avg acc :0.8428570628166199
acc :0.6000000238418579
avg acc :0.8125
acc :1.0
avg acc :0.8333333134651184
acc :1.0
avg acc :0.8500000238418579
acc :0.800000011920929
avg acc :0.8454545736312866
acc :1.0
avg acc :0.8583333492279053
acc :1.0
avg acc :0.8692308068275452
acc :0.800000011920929
avg acc :0.8642857670783997
acc :0.8999999761581421
avg acc :0.8666666746139526
acc :0.8999999761581421
avg acc :0.8687499761581421

感兴趣的话，还可以对比参看 https://github.com/white127/insuranceQA-cnn-lstm 中的 top1 精度。

QA 问答对排序模型小记

猜你喜欢