TensorFlow Fold 初探（一）——TreeLstm情感分类

论文链接：

https://arxiv.org/pdf/1503.00075.pdf

数据集：Stanford Sentiment Treebank

该数据集对影评句子利用句法解析器生成递归二叉树结构（成分句法树，对应 Constituency Tree-LSTM），并且在每一个节点上标注了情感标签（从负面到正面共 0-4 五个等级）

模型：TreeLstm

对于上述二叉树每一个节点构造Lstm Cell,

叶节点使用当前词的word-embedding作为序列输入，并将上一步的输入state置为 0(dim is num_units of lstm cell)

内部节点（非叶节点）使用二叉树的左右两个分支的state(h state)作为“上一步”的信息输入,取代传统Lstm的单侧state,保留了两侧输入的信息，并将当前序列输入置为0(dim is word-embedding dim)

(这样我们就确定了两种不同输入结构的TreeLstm Cell)

目标：

将各个节点的输出对情感标签进行建模。

TensorFlow_Fold (td)知识点：

一个好的介绍参见：

https://www.leiphone.com/news/201702/cb7cPOtzFj1pgRpk.html

这里仅就一些用到的概念接口简单介绍。

Blocks:

一种对象转换映射，常见功能如进行数据输入（如python内建对象对td内对象的映射：td.Scalar td.Tensor等）、模型构建（td.Function包裹的tf ops、td.FC:全连接神经层、由td.ScopedLayer 包裹的tf.contrib.rnn.RNNCell、td.Embedding等）

Layers:

td.FC、td.ScopedLayer 包裹的tf.contrib.rnn.RNNCell属于Layers的实现，其引入是为了在不同输入结构的神经层的实例间实现变量共享（对应tf的variable scope，如对上述TreeLstm Cell的两种输入结构）

其他一些见下面简单例子：

简单例子:

import tensorflow as tf
import tensorflow_fold as td

# td.Record 用于进行序列或集合输入
# td.Composition 可以在内部定义多个blocks的组合，固定使用reads进行不同blocks
#  的连接。
# td.Metric 一般用于在优化流程中得到一些需要的距离（如loss）
# 在td.Compiler 中可以通过 compiler.metric_tensors方法得到 在流程
# 中定义的loss

# about ">>": you may be familiar with scala "=>" or java "->"
def basic_eval():
    """Evaluate a handful of minimal Fold blocks and print each result.

    Covers td.Record over sequences, vectors and pairs of sequences, and a
    td.Composition that wires two inputs together with ``reads`` and
    registers a td.Metric named 'loss'.
    """
    # Kept alive for the eval() calls below (the original marks it
    # "used for eval").
    session = tf.InteractiveSession()

    # A record with a single field: a sequence of scalars.
    seq_record = td.Record((td.Map(td.Scalar()),))
    print(seq_record.eval(([1, 2],)))

    # A record with a single field: a fixed-size vector.
    vector_record = td.Record((td.Vector(size=1),))
    print(vector_record.eval(([1],)))

    # A record with two fields, each a sequence of scalars.
    two_seq_record = td.Record((td.Map(td.Scalar()), td.Map(td.Scalar())))
    print(two_seq_record.eval(([1, 2, 3], [4, 5, 6])))

    # Composition: input[0] is the label, input[1] the prediction. Their
    # difference is reported as the 'loss' metric; the prediction itself is
    # the block output.
    scorer = td.Composition()
    with scorer.scope():
        label = scorer.input[0]
        prediction = scorer.input[1]
        difference = td.Function(tf.subtract).reads(label, prediction)
        td.Metric('loss').reads(difference)
        scorer.output.reads(prediction)

    pipeline = td.Record((td.Scalar(), td.Scalar())) >> scorer
    print(pipeline.eval((0, 1)))


# TensorFlow_Fold 有很强的函数式编程风格 Do You Love It?

# td.OneOf 可以进行pre_block设置，在数据流程进入case_blocks
# 之前运行，一般常做读取数据用

# td.AllOf: dual to "map" in meaning (several blocks applied to one input)
def functional_eval():
    """Walk through the functional-style Fold blocks and print their results.

    Covers td.OneOf (with and without ``pre_block``), td.AllOf and a
    recursive expression evaluator built on td.ForwardDeclaration. A
    td.Reduce training example is defined as well, but this function never
    calls it.
    """
    import random
    from functools import partial

    session = tf.InteractiveSession()

    # OneOf is not identical to "?:" in Java: the order of the cases is
    # reversed with respect to the condition.
    abs_or_identity = td.OneOf(
        key_fn=lambda value: value > 0,
        case_blocks=(td.Scalar() >> td.Function(tf.abs), td.Scalar()),
    )
    print(abs_or_identity.eval(-10))

    # Same idea, but the input is converted once by pre_block before it
    # reaches whichever case block is selected.
    abs_or_identity_pre = td.OneOf(
        key_fn=lambda seq: seq[0] > 0,
        case_blocks=(td.Function(tf.abs), td.Identity()),
        pre_block=td.Map(td.Scalar()) >> td.GetItem(0),
    )
    print(abs_or_identity_pre.eval([-10]))

    # Two-argument cases: pre_block turns the Python pair into two scalars.
    sub_or_add = td.OneOf(
        key_fn=lambda pair: pair[0] > 0,
        case_blocks=(td.Function(tf.subtract), td.Function(tf.add)),
        pre_block=(td.Scalar(), td.Scalar()),
    )
    print(sub_or_add.eval((-10, -10)))

    # AllOf feeds the same scalar to every child block.
    abs_and_negate = td.Scalar() >> td.AllOf(
        td.Function(tf.abs),
        td.Function(partial(tf.multiply, -1.0)),
    )
    print(abs_and_negate.eval(-1))

    # td.ForwardDeclaration defines the computation for a recursive,
    # case-based structure; it takes effect once resolve_to() is called.
    # Think of case classes and pattern matching in Scala, e.g.
    # http://docs.scala-lang.org/tutorials/scala-for-java-programmers.html
    # (section "Case Classes and Pattern Matching").
    # This is what makes processing the TreeLstm structure possible.
    # The input/output types of td.ForwardDeclaration may be omitted.
    def forward_declaration_demo():
        inner_session = tf.InteractiveSession()
        from tensorflow_fold.blocks import result_types as tdt
        expr_decl = td.ForwardDeclaration(
            tdt.PyObjectType(), tdt.TensorType(shape=()), name="tfd")

        literal_block = td.GetItem('val') >> td.Scalar()
        operands = td.Record({'left': expr_decl(), 'right': expr_decl()})
        addition_block = operands >> td.Function(tf.add)
        expr_block = td.OneOf(
            lambda node: node['op'],
            {'lit': literal_block, 'add': addition_block},
        )

        expr_decl.resolve_to(expr_block)

        hundred = {"op": "lit", "val": 100.0}
        twenty = {"op": "lit", "val": 20.0}
        total = {"op": "add", "left": hundred, "right": twenty}

        print(expr_block.eval(hundred))
        print(expr_block.eval(total))

    forward_declaration_demo()

    # Example of optimising over sequences with a non-differentiable target
    # function (taken from the official site).
    # td.Reduce accepts a sequence (td.Map(..)) or an object converted from
    # a tuple and combines the elements pairwise, which is why td.Concat
    # (tuple input) can follow it; in general, blocks produced by
    # td.Function all accept tuple input.
    #
    # Approximating a recursively defined function (sum etc.) with a
    # recursively defined net_block is "reasonable". One difficulty of such
    # optimisation is that sequences have variable length, which is one of
    # the goals and key points of td.
    #
    # td.Compiler defines the input/output format of the optimised block.
    #
    # NOTE(review): non_diff_seq_opt is defined here but never invoked.
    def non_diff_seq_opt():
        train_session = tf.InteractiveSession()

        def reduce_net_block():
            pairwise_net = (
                td.Concat()
                >> td.FC(20)
                >> td.FC(1, activation=None)
                >> td.Function(lambda xs: tf.squeeze(xs, axis=1))
            )
            return td.Map(td.Scalar()) >> td.Reduce(pairwise_net)

        def random_example(fn):
            length = random.randrange(1, 10)
            values = [random.uniform(0, 1) for _ in range(length)]
            return values, fn(values)

        # The point of this example: the input shape can vary, and the
        # target function need not be differentiable.
        def train(fn, batch_size=100):
            net_block = reduce_net_block()
            compiler = td.Compiler.create((net_block, td.Scalar()))
            predicted, expected = compiler.output_tensors
            loss = tf.nn.l2_loss(predicted - expected)
            train_op = tf.train.AdamOptimizer().minimize(loss)
            train_session.run(tf.global_variables_initializer())
            validation_fd = compiler.build_feed_dict(
                random_example(fn) for _ in range(1000))
            for step in range(2000):
                batch_fd = compiler.build_feed_dict(
                    random_example(fn) for _ in range(batch_size))
                train_session.run(train_op, batch_fd)
                if step % 100 == 0:
                    print(step, train_session.run(loss, validation_fd))
            return net_block

        sum_block = train(sum)
        print(sum_block.eval([10, 20, 30]))

# Run both demo functions when this snippet is executed as a script.
if __name__ == "__main__":
    basic_eval()
    functional_eval()

有了上述知识准备，就可以看一下TreeLstm对于情感分析问题的实现了

TreeLstm例子：（摘自官网）

#file path:
#http://nlp.stanford.edu/data/glove.840B.300d.zip
#http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
#http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip

import codecs
import os

from nltk.tokenize import sexpr
import numpy as np
import tensorflow as tf
import tensorflow_fold as td

# global settings
# Module-level session; construct_treelstm() issues its sess.run() calls
# and saves checkpoints through this object.
sess = tf.InteractiveSession()

# 数据准备
# 使用 glove embedding
data_dir = "data/"


def load_data():
    """Build the filtered embeddings and tree lists and cache them on disk.

    Reads, relative to the current working directory:
      * train_req_data/glove.840B.300d.txt (full GloVe vectors)
      * train_req_data/stanfordSentimentTreebank/SOStr.txt (sentences whose
        tokens are separated by '|')
      * train_req_data/trees/{train,dev,test}.txt (one tree per line)

    Writes into ``data_dir``:
      * filtered_glove.txt -- the GloVe lines whose word occurs in SOStr.txt
      * data.pkl -- a pickled dict with the keys "weight_matrix",
        "word_idx", "train_trees", "dev_trees" and "test_trees"

    ``data_dir`` is created if it does not exist yet, so a first run no
    longer fails when the output files are opened for writing.

    Returns:
        None. The results are only available through the files above.
    """
    import pickle

    full_glove_path = "train_req_data/glove.840B.300d.txt"
    sentence_path = "train_req_data/stanfordSentimentTreebank/SOStr.txt"
    train_path = "train_req_data/trees/train.txt"
    dev_path = "train_req_data/trees/dev.txt"
    test_path = "train_req_data/trees/test.txt"

    filtered_glove_path = os.path.join(data_dir, "filtered_glove.txt")
    # Same location construct_treelstm() reads from ("data/data.pkl").
    pickle_path = os.path.join(data_dir, "data.pkl")

    # Both outputs live in data_dir; make sure it exists before writing.
    # (isdir + makedirs rather than exist_ok=True to stay Python 2 friendly,
    # like the rest of this file.)
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    def filter_glove():
        """Copy the GloVe lines whose word appears in the sentence file."""
        vocab = set()
        with codecs.open(sentence_path, encoding="utf-8") as f:
            for line in f:
                # Drop the newline and backslashes, then split into words.
                vocab.update(line.strip().replace('\\', '').split('|'))
        nread = 0
        nwrote = 0
        with codecs.open(full_glove_path, encoding="utf-8") as f:
            with codecs.open(filtered_glove_path, "w", encoding="utf-8") as out:
                for line in f:
                    nread += 1
                    line = line.strip()
                    if not line:
                        continue
                    # The word is everything before the first space.
                    if line.split(u" ", 1)[0] in vocab:
                        out.write(line + "\n")
                        nwrote += 1
        print("read %s lines, wrote %s" % (nread, nwrote))

    def load_embeddings(embedding_path):
        """Return (weight matrix, word -> row index) for an embedding file."""
        print("loading word embedding from %s" % embedding_path)
        weight_vectors = []
        word_idx = {}
        with codecs.open(embedding_path, encoding="utf-8") as f:
            for line in f:
                word, vec = line.split(u' ', 1)
                word_idx[word] = len(weight_vectors)
                weight_vectors.append(np.array(vec.split(), dtype=np.float32))

        # Re-key the bracket embeddings under the -LRB- / -RRB- tokens.
        word_idx[u'-LRB-'] = word_idx.pop(u'(')
        word_idx[u'-RRB-'] = word_idx.pop(u')')
        # One extra random row at the end; construct_treelstm() maps words
        # missing from word_idx to index len(word_idx), i.e. this row.
        weight_vectors.append(
            np.random.uniform(
                -0.05, 0.05, weight_vectors[0].shape
            ).astype(np.float32)
        )
        # np.stack turns the list of vectors into one 2-D array of shape
        # [number of rows, embedding dim].
        return np.stack(weight_vectors), word_idx

    def load_trees(filename):
        """Return the lines of ``filename``, stripped and without backslashes."""
        with codecs.open(filename, encoding="utf-8") as f:
            trees = [line.strip().replace('\\', '') for line in f]
        print("loaded %s trees from %s" % (len(trees), filename))
        return trees

    filter_glove()
    weight_matrix, word_idx = load_embeddings(filtered_glove_path)
    train_trees = load_trees(train_path)
    dev_trees = load_trees(dev_path)
    test_trees = load_trees(test_path)

    print("start_dump")
    with open(pickle_path, "wb") as f:
        pickle.dump(
            {
                "weight_matrix": weight_matrix,
                "word_idx": word_idx,
                "train_trees": train_trees,
                "dev_trees": dev_trees,
                "test_trees": test_trees,
            },
            f,
        )
    print("dump end")

# 定义 TreeLstm Cell
# 各种RNN实现 基本上是通过重载 _RNNCell 的 __call__ 方法给出的，
# 只需要指定 state output 的嵌套输入输出格式即可，这里仅仅是对 BasicLSTMCell
# 中 h state 及相应的 forget-gate 进行数量扩展
class BinaryTreeLSTMCell(tf.contrib.rnn.BasicLSTMCell):
    """BasicLSTMCell variant whose state is a pair of child (c, h) states.

    Compared with the base cell, the hidden state and the forget gate are
    duplicated: one for the left child and one for the right child.
    """

    def __init__(self, num_units, keep_prob=1.0):
        """Create the cell.

        Args:
            num_units: forwarded to BasicLSTMCell.
            keep_prob: keep probability for dropout on the candidate input;
                a float, or a non-float value such as a tensor.
        """
        super(BinaryTreeLSTMCell, self).__init__(num_units)
        self._keep_prob = keep_prob

    def __call__(self, inputs, state, scope=None):
        """Combine ``inputs`` with the two child states.

        Args:
            inputs: input for this node.
            state: pair ``((c0, h0), (c1, h1))`` of the children's states.
            scope: optional variable scope; defaults to the class name.

        Returns:
            ``(new_h, LSTMStateTuple(new_c, new_h))``.
        """
        with tf.variable_scope(scope or type(self).__name__):
            (left_c, left_h), (right_c, right_h) = state

            # One linear layer produces all five gate pre-activations.
            merged = tf.concat([inputs, left_h, right_h], 1)
            gates = tf.contrib.layers.linear(merged, 5 * self._num_units)
            in_gate, candidate, forget_left, forget_right, out_gate = tf.split(
                value=gates, num_or_size_splits=5, axis=1)

            candidate = self._activation(candidate)
            # Dropout is skipped only for a float keep_prob that is >= 1;
            # any non-float keep_prob always goes through dropout.
            keep_prob = self._keep_prob
            if not isinstance(keep_prob, float) or keep_prob < 1:
                candidate = tf.nn.dropout(candidate, keep_prob)

            # Each child's cell state passes through its own forget gate.
            kept_left = left_c * tf.sigmoid(forget_left + self._forget_bias)
            kept_right = right_c * tf.sigmoid(forget_right + self._forget_bias)
            new_c = kept_left + kept_right + tf.sigmoid(in_gate) * candidate
            new_h = self._activation(new_c) * tf.sigmoid(out_gate)

            return new_h, tf.contrib.rnn.LSTMStateTuple(new_c, new_h)

def construct_treelstm():
    """Build the Tree-LSTM sentiment model and train it.

    Loads the cached data from "data/data.pkl", builds the Fold model,
    trains it for EPOCHS epochs and saves a checkpoint whenever the root
    accuracy on the dev set improves. Uses the module-level ``sess``.
    """
    import pickle
    # NOTE(review): pickle.load must only be used on trusted files; this one
    # is expected to be the file written by load_data().
    with open("data/data.pkl", "rb") as f:
        data_construct = pickle.load(f)
        weight_matrix = data_construct["weight_matrix"]
        word_idx = data_construct["word_idx"]
        train_trees = data_construct["train_trees"]
        dev_trees = data_construct["dev_trees"]
        # NOTE(review): test_trees is loaded but not used in this function.
        test_trees = data_construct["test_trees"]
    print("load end")

    # model construct
    # Defaults to 1.0; the training feed dict below overrides it with
    # KEEP_PROB.
    keep_prob_ph = tf.placeholder_with_default(1.0, [])
    lstm_num_units = 300

    # td.ScopedLayer (mentioned earlier under td.Layer) controls the
    # variable scope.
    tree_lstm = td.ScopedLayer(
        tf.contrib.rnn.DropoutWrapper(
            BinaryTreeLSTMCell(lstm_num_units, keep_prob = keep_prob_ph),
            input_keep_prob = keep_prob_ph, output_keep_prob = keep_prob_ph,
        ),
        name_or_scope="tree_lstm"
    )
    # num of distinct sentiment labels
    NUM_CLASSES = 5

    # Linear activation (activation=None); one output per sentiment class.
    output_layer = td.FC(NUM_CLASSES, activation=None, name="output_layer")

    # td.Embedding is not partitioned by hash.
    # Because of this simple structure the embedding cannot be too large:
    # an initializer matrix above 2 GB raises an error, which is what
    # happens e.g. when the vectors are not filtered by filter_glove.
    word_embedding = td.Embedding(
        *weight_matrix.shape, initializer=weight_matrix, name = "word_embedding"
    )

    # tree construct declaration
    # Resolved further down, once embed_tree() can build the sub-tree block.
    embed_subtree = td.ForwardDeclaration(name = "embed_subtree")

    def logits_and_state():
        """Create a block that maps tokens to a (logits, state) tuple."""
        # Words missing from word_idx map to the index one past the known
        # words.
        unknown_idx = len(word_idx)
        lookup_word = lambda word: word_idx.get(word, unknown_idx)

        word2vec = (td.GetItem(0) >> td.InputTransform(lookup_word) >> td.Scalar("int32")
                    >> word_embedding)
        pair2vec = (embed_subtree(), embed_subtree())

        # The cell takes two child states, hence two copies of state_size.
        zero_state = td.Zeros((tree_lstm.state_size,) * 2)
        # Zero stand-in for the word input, sized from the embedding output.
        zero_inp = td.Zeros(word_embedding.output_type.shape[0])

        # One token: word vector plus zero states.
        # Two tokens: zero input plus the states of both sub-trees.
        word_case = td.AllOf(word2vec, zero_state)
        pair_case = td.AllOf(zero_inp, pair2vec)

        # Dispatch on the number of tokens (1 or 2).
        tree2vec = td.OneOf(len, [(1, word_case), (2, pair_case)])

        return tree2vec >> tree_lstm >> (output_layer, td.Identity())

    def tf_node_loss(logits, labels):
        """Return the sparse softmax cross-entropy of logits vs. labels."""
        return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)

    # every label
    def tf_fine_grained_hits(logits, labels):
        """Return 1.0 where the argmax class equals the label, else 0.0."""
        predictions = tf.cast(tf.argmax(logits, 1), tf.int32)
        return tf.cast(tf.equal(predictions, labels), tf.float64)

    # two category label
    def tf_binary_hits(logits, labels):
        """Return 1.0 where the binary prediction agrees with the label.

        The prediction is positive when P(3) + P(4) > P(0) + P(1); the
        label is positive when it is greater than 2.
        """
        softmax = tf.nn.softmax(logits)
        binary_predictions = (softmax[:, 3] + softmax[:, 4]) > (softmax[:, 0] + softmax[:, 1])
        binary_labels = labels > 2
        return tf.cast(tf.equal(binary_predictions, binary_labels), tf.float64)

    def add_metrics(is_root, is_neutral):
        """Create a block that records loss/hit metrics and outputs the state.

        "root_*" metrics are only added when ``is_root`` is true; the
        binary-hit metrics are only added when ``is_neutral`` is false.
        """
        c = td.Composition(
            name = "predict(is_root=%s, is_neutral=%s)" % (is_root, is_neutral)
        )
        with c.scope():
            # Destructure the input: (labels, (logits, state)).
            labels = c.input[0]
            logits = td.GetItem(0).reads(c.input[1])
            state = td.GetItem(1).reads(c.input[1])

            loss = td.Function(tf_node_loss)
            td.Metric("all_loss").reads(loss.reads(logits, labels))
            if is_root: td.Metric("root_loss").reads(loss)

            hits = td.Function(tf_fine_grained_hits)
            td.Metric("all_hits").reads(hits.reads(logits, labels))
            if is_root :td.Metric("root_hits").reads(hits)

            # Binary hits are only computed for non-neutral labels.
            if not is_neutral:
                binary_hits = td.Function(tf_binary_hits).reads(logits, labels)
                td.Metric("all_binary_hits").reads(binary_hits)
                if is_root: td.Metric("root_binary_hits").reads(binary_hits)

            c.output.reads(state)
        return c

    # Split a single tree record into its label and its sub-trees.
    def tokenize(s):
        """Return (label, tokens) for a tree string.

        The first and last characters are dropped, the leading label is
        split off and the rest is tokenized with sexpr.sexpr_tokenize.
        """
        label, phrase = s[1:-1].split(None, 1)
        return label, sexpr.sexpr_tokenize(phrase)

    def embed_tree(logits_and_state, is_root):
        """Create a block that embeds a tree string and records its metrics."""
        # The key is whether the label equals "2"; the is_neutral=True case
        # is the second block, the is_neutral=False case the first.
        return td.InputTransform(tokenize) >> td.OneOf(
            key_fn=lambda pair: pair[0] == "2",
            case_blocks=(
                add_metrics(is_root, is_neutral=False),
                add_metrics(is_root, is_neutral=True)
            ),
            pre_block=(td.Scalar("int32"), logits_and_state)
        )

    # The root model and the recursive sub-tree block differ only in
    # is_root.
    model = embed_tree(logits_and_state(), is_root=True)
    embed_subtree.resolve_to(embed_tree(logits_and_state(), is_root=False))

    compiler = td.Compiler.create(model)
    print("input type: %s" % model.input_type)
    print("output type: %s" % model.output_type)

    # setup for training
    # Mean of every metric tensor; evaluated on the dev set in dev_eval().
    metrics = {k: tf.reduce_mean(v) for k, v in compiler.metric_tensors.items()}

    LEARNING_RATE = 0.05
    KEEP_PROB = 0.75
    BATCH_SIZE = 100
    EPOCHS = 20

    # downscale for prevent overfitting
    EMBEDDING_LEARNING_RATE_FACTOR = 0.1

    train_feed_dict = {keep_prob_ph: KEEP_PROB}
    # Training loss: sum of the "all_loss" metric tensor.
    loss = tf.reduce_sum(compiler.metric_tensors["all_loss"])
    opt = tf.train.AdagradOptimizer(LEARNING_RATE)

    grads_and_vars = opt.compute_gradients(loss)
    found = 0
    for i, (grad, var) in enumerate(grads_and_vars):
        # word_embedding.weights is the variable created for the embedding;
        # its gradient is scaled by EMBEDDING_LEARNING_RATE_FACTOR.
        if var == word_embedding.weights:
            found += 1
            grad = tf.scalar_mul(EMBEDDING_LEARNING_RATE_FACTOR, grad)
            grads_and_vars[i] = (grad, var)
    # Consistency check: exactly one gradient entry must have matched.
    assert found == 1
    train = opt.apply_gradients(grads_and_vars)
    saver = tf.train.Saver()

    sess.run(tf.global_variables_initializer())

    def train_step(batch):
        """Run one optimizer step on ``batch`` and return its loss."""
        train_feed_dict[compiler.loom_input_tensor] = batch
        _, batch_loss = sess.run([train, loss], train_feed_dict)
        return batch_loss

    def train_epoch(train_set):
        """Train on every batch of ``train_set``; return the summed loss."""
        return sum(train_step(batch) for batch in td.group_by_batches(train_set, BATCH_SIZE))

    train_set = compiler.build_loom_inputs(train_trees)
    dev_feed_dict = compiler.build_feed_dict(dev_trees)

    def dev_eval(epoch, train_loss):
        """Print the dev-set metrics and return the "root_hits" value."""
        dev_metrics = sess.run(metrics, dev_feed_dict)
        dev_loss = dev_metrics["all_loss"]
        # Every metric whose name ends in "hits", reported as a percentage.
        dev_accuracy = [
            "%s: %.2f" % (k, v * 100) for k, v in
            sorted(dev_metrics.items()) if k.endswith("hits")
        ]
        print('epoch:%4d, train_loss: %.3e, dev_loss_avg: %.3e, dev_accuracy:\n  [%s]'
              % (epoch, train_loss, dev_loss, ' '.join(dev_accuracy)))
        return dev_metrics['root_hits']

    best_accuracy = 0.0
    save_path = os.path.join(data_dir, 'sentiment_model')
    # Epochs are numbered from 1; presumably td.epochs yields a reshuffled
    # copy of train_set per epoch -- confirm against the Fold docs.
    for epoch, shuffled in enumerate(td.epochs(train_set, EPOCHS), 1):
        train_loss = train_epoch(shuffled)
        accuracy = dev_eval(epoch, train_loss)
        # Save a checkpoint only when the dev root accuracy improves.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            checkpoint_path = saver.save(sess, save_path, global_step=epoch)
            print('model saved in file: %s' % checkpoint_path)



# Build the cached data set first, then construct and train the model.
if __name__ == "__main__":
    load_data()
    construct_treelstm()

TensorFlow Fold 初探（一）——TreeLstm情感分类

猜你喜欢