数据:影评数据。利用句法解析器为每条影评生成递归二叉树结构(Constituency Tree-LSTM 所使用的成分句法树),并且在每一个节点上标注了情感标签(从负到正 0-4 五个等级)。
模型:TreeLstm
对于上述二叉树每一个节点构造Lstm Cell,
叶节点使用当前词的word-embedding作为序列输入,并将上一步的输入state置为 0(dim is num_units of lstm cell)
内部节点(非叶节点)使用二叉树左右两个分支的 state(h state)作为“上一步”的信息输入,取代传统 Lstm 的单侧 state,从而保留了两侧输入的信息,并将当前序列输入置为 0(dim is word-embedding dim)
(这样我们就确定了两种不同输入结构的TreeLstm Cell)
目标:
将各个节点的输出对情感标签进行建模。
TensorFlow_Fold (td)知识点:
一个好的介绍参见:
https://www.leiphone.com/news/201702/cb7cPOtzFj1pgRpk.html
这里仅就一些用到的概念接口简单介绍。
Blocks:
一种对象转换映射,常见功能如进行数据输入(如 python 内建对象到 td 内对象的映射:td.Scalar、td.Tensor 等)、模型构建(td.Function 包裹的 tf ops、td.FC:全连接神经层、由 td.ScopedLayer 包裹的 tf.contrib.rnn.RNNCell、td.Embedding 等)
Layers:
td.FC、td.ScopedLayer 包裹的 tf.contrib.rnn.RNNCell 属于 Layers 的实现,其引入是为了在不同输入结构的神经层实例间实现变量共享(对应 tf 的 variable scope,如上述 TreeLstm Cell 的两种输入结构)
其他一些见下面简单例子:
简单例子:
import random
from functools import partial

import tensorflow as tf
import tensorflow_fold as td
from tensorflow_fold.blocks import result_types as tdt


# td.Record      feeds a tuple/dict of inputs, one child block per field.
# td.Composition wires several blocks together inside its scope; the children
#                are always connected explicitly with `.reads(...)`.
# td.Metric      collects a value (e.g. a loss) produced somewhere inside the
#                pipeline; with a td.Compiler the collected values are
#                available through `compiler.metric_tensors`.
# `>>`           pipes one block into the next (compare Scala "=>" or
#                Java "->").
def basic_eval():
    """Evaluate a few small blocks: Record, Map, Vector, Composition, Metric.

    Each result is printed. The session is closed on exit so that repeated
    demo calls do not leave interactive sessions open.
    """
    # `sess` is not referenced below; presumably it is created so that
    # `Block.eval()` can find a default session -- confirm against td docs.
    sess = tf.InteractiveSession()
    try:
        full_block_0 = td.Record((td.Map(td.Scalar()),))
        print(full_block_0.eval(([1, 2],)))

        full_block_1 = td.Record((td.Vector(size=1),))
        print(full_block_1.eval(([1],)))

        full_block_2 = td.Record((td.Map(td.Scalar()), td.Map(td.Scalar())))
        print(full_block_2.eval(([1, 2, 3], [4, 5, 6])))

        # A composition that records (label - prediction) as the metric
        # 'loss' and passes the prediction through as its output.
        y = td.Composition()
        with y.scope():
            label = y.input[0]
            prediction = y.input[1]
            l2 = (td.Function(tf.subtract)).reads(label, prediction)
            td.Metric('loss').reads(l2)
            y.output.reads(prediction)

        c0 = td.Record((td.Scalar(), td.Scalar())) >> y
        print(c0.eval((0, 1)))
    finally:
        sess.close()


# TensorFlow Fold has a strongly functional programming flavour.
# td.OneOf accepts a `pre_block` that runs before the data reaches the
#          `case_blocks`; it is typically used to read/convert the input.
# td.AllOf is the dual of "map": every child block is applied to the SAME
#          input.
def functional_eval():
    """Evaluate small td.OneOf / td.AllOf examples and print the results."""
    sess = tf.InteractiveSession()
    try:
        # NOTE: not the same as `cond ? a : b` in Java -- the order is
        # reversed (original author's note): the tuple is indexed by the key.
        b0 = td.OneOf(
            key_fn=lambda x: x > 0,
            case_blocks=(td.Scalar() >> td.Function(tf.abs), td.Scalar())
        )
        print(b0.eval(-10))

        b1 = td.OneOf(
            key_fn=lambda x: x[0] > 0,
            case_blocks=(td.Function(tf.abs), td.Identity()),
            pre_block=td.Map(td.Scalar()) >> td.GetItem(0)
        )
        print(b1.eval([-10]))

        b2 = td.OneOf(
            key_fn=lambda x: x[0] > 0,
            case_blocks=(td.Function(tf.subtract), td.Function(tf.add)),
            pre_block=(td.Scalar(), td.Scalar())
        )
        print(b2.eval((-10, -10)))

        b3 = td.Scalar() >> td.AllOf(
            td.Function(tf.abs),
            td.Function(partial(tf.multiply, -1.0))
        )
        print(b3.eval(-1))
    finally:
        sess.close()


# td.ForwardDeclaration declares the computation for a recursive,
# case-structured input; it takes effect once `resolve_to` is called.
# Think of case classes in Scala, e.g. "Case Classes and Pattern Matching" in
# http://docs.scala-lang.org/tutorials/scala-for-java-programmers.html
# This is what makes processing a TreeLstm structure possible.
# The input/output types of td.ForwardDeclaration may be omitted.
def forwordDeclarationTest():
    """Evaluate a tiny recursive expression tree ('lit' / 'add' nodes)."""
    sess = tf.InteractiveSession()
    try:
        expr_fwd = td.ForwardDeclaration(
            tdt.PyObjectType(), tdt.TensorType(shape=()), name="tfd")
        lit_case = td.GetItem('val') >> td.Scalar()
        add_case = (td.Record({'left': expr_fwd(), 'right': expr_fwd()}) >>
                    td.Function(tf.add))
        # Dispatch on the node's 'op' field.
        expr = td.OneOf(lambda x: x['op'], {'lit': lit_case, 'add': add_case})
        expr_fwd.resolve_to(expr)

        expr0 = {"op": "lit", "val": 100.0}
        expr1 = {"op": "lit", "val": 20.0}
        expr2 = {"op": "add", "left": expr0, "right": expr1}
        print(expr.eval(expr0))
        print(expr.eval(expr2))
    finally:
        sess.close()


# Optimising towards a non-differentiable sequence function (example taken
# from the official Fold documentation).
# td.Reduce consumes a sequence (e.g. from td.Map(...)) or a converted tuple
#           and folds it pairwise; each step receives a 2-tuple, which is why
#           it can be followed by td.Concat (tuple input). This design is
#           general: blocks built with td.Function accept tuple input too.
# Defining `net_block` recursively is a "reasonable" way to approximate a
# recursively defined function such as `sum`.
# One difficulty of optimising recursive definitions is the variable sequence
# length; handling that is one of the goals and key points of td.
# td.Compiler fixes the input/output format of the block being optimised.
def non_diff_seq_opt():
    """Train a small reduce-network to imitate `sum` and print a prediction.

    Not called from the `__main__` guard in the original script; call it
    explicitly to run the (2000-step) training demo.
    """
    sess = tf.InteractiveSession()

    def reduce_net_block():
        """Return a block mapping a list of scalars to a single scalar."""
        net_block = td.Concat() >> td.FC(20) >> td.FC(1, activation=None) >> \
            td.Function(lambda xs: tf.squeeze(xs, axis=1))
        return td.Map(td.Scalar()) >> td.Reduce(net_block)

    def random_example(fn):
        """Return (data, fn(data)) for 1 to 9 uniform random values."""
        length = random.randrange(1, 10)
        data = [random.uniform(0, 1) for _ in range(length)]
        result = fn(data)
        return data, result

    # The point of this example: the input shape may vary between examples,
    # and the target function `fn` need not be differentiable.
    def train(fn, batch_size=100):
        """Fit a fresh reduce-network to `fn`; return the trained block."""
        net_block = reduce_net_block()
        compiler = td.Compiler.create((net_block, td.Scalar()))
        y, y_ = compiler.output_tensors
        loss = tf.nn.l2_loss(y - y_)
        # Named `train_op` so it does not shadow the enclosing `train`.
        train_op = tf.train.AdamOptimizer().minimize(loss)
        sess.run(tf.global_variables_initializer())
        validation_fd = compiler.build_feed_dict(
            random_example(fn) for _ in range(1000))
        for i in range(2000):
            sess.run(train_op, compiler.build_feed_dict(
                random_example(fn) for _ in range(batch_size)))
            if i % 100 == 0:
                print(i, sess.run(loss, validation_fd))
        return net_block

    try:
        sum_block = train(sum)
        print(sum_block.eval([10, 20, 30]))
    finally:
        sess.close()


if __name__ == "__main__":
    # Same execution order as the original script, where
    # forwordDeclarationTest() was invoked at module level.
    forwordDeclarationTest()
    basic_eval()
    functional_eval()
有了上述知识准备,就可以看一下TreeLstm对于情感分析问题的实现了
TreeLstm例子:(摘自官网)
# Data sources:
#   http://nlp.stanford.edu/data/glove.840B.300d.zip
#   http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
#   http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
import codecs
import os
import pickle

from nltk.tokenize import sexpr
import numpy as np
import tensorflow as tf
import tensorflow_fold as td

# global settings
sess = tf.InteractiveSession()

# Data preparation (GloVe embeddings are used).
data_dir = "data/"


def _data_pickle_path():
    """Return the path of the pickled dataset inside `data_dir`."""
    # One definition shared by load_data() and construct_treelstm(), so the
    # writer and the reader cannot drift apart when `data_dir` is changed.
    return os.path.join(data_dir, "data.pkl")


def load_data():
    """Filter GloVe, load embeddings and trees, and pickle them.

    Reads the raw inputs from `train_req_data/` and writes
    `filtered_glove.txt` and `data.pkl` into `data_dir` (created if missing).
    """
    full_glove_path = "train_req_data/glove.840B.300d.txt"
    train_path = "train_req_data/trees/train.txt"
    dev_path = "train_req_data/trees/dev.txt"
    test_path = "train_req_data/trees/test.txt"
    filtered_glove_path = os.path.join(data_dir, "filtered_glove.txt")

    def filter_glove():
        """Write the GloVe lines whose word occurs in the treebank vocab."""
        vocab = set()
        sentence_path = "train_req_data/stanfordSentimentTreebank/SOStr.txt"
        with codecs.open(sentence_path, encoding="utf-8") as f:
            for line in f:
                # Strip the newline and backslashes; words are '|'-separated.
                vocab.update(line.strip().replace('\\', '').split('|'))
        nread = 0
        nwrote = 0
        with codecs.open(full_glove_path, encoding="utf-8") as f:
            with codecs.open(filtered_glove_path, "w",
                             encoding="utf-8") as out:
                for line in f:
                    nread += 1
                    line = line.strip()
                    if not line:
                        continue
                    if line.split(u" ", 1)[0] in vocab:
                        out.write(line + "\n")
                        nwrote += 1
        print("read %s lines, wrote %s" % (nread, nwrote))

    def load_embeddings(embedding_path):
        """Return (weight matrix, word -> row index dict) from a GloVe file.

        One extra random row is appended after all words; callers use index
        `len(word_idx)` for it (see `logits_and_state`).
        """
        print("loading word embedding from %s" % embedding_path)
        weight_vectors = []
        word_idx = {}
        with codecs.open(embedding_path, encoding="utf-8") as f:
            for line in f:
                word, vec = line.split(u' ', 1)
                word_idx[word] = len(weight_vectors)
                weight_vectors.append(np.array(vec.split(), dtype=np.float32))
        # Re-key the bracket tokens to '-LRB-' / '-RRB-'
        # (presumably the spelling used in the parse trees -- confirm).
        word_idx[u'-LRB-'] = word_idx.pop(u'(')
        word_idx[u'-RRB-'] = word_idx.pop(u')')
        # Random vector for the extra (unknown-word) row.
        weight_vectors.append(
            np.random.uniform(
                -0.05, 0.05, weight_vectors[0].shape
            ).astype(np.float32)
        )
        # np.stack adds one dimension (list of vectors -> 2-D array);
        # resulting shape is [buckets_num, embed_dim].
        return np.stack(weight_vectors), word_idx

    def load_trees(filename):
        """Return the lines of `filename`, stripped and without backslashes."""
        with codecs.open(filename, encoding="utf-8") as f:
            trees = [line.strip().replace('\\', '') for line in f]
        print("loaded %s trees from %s" % (len(trees), filename))
        return trees

    # Both output files live in data_dir; make sure it exists before writing.
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    filter_glove()
    weight_matrix, word_idx = load_embeddings(filtered_glove_path)
    train_trees = load_trees(train_path)
    dev_trees = load_trees(dev_path)
    test_trees = load_trees(test_path)

    print("start_dump")
    with open(_data_pickle_path(), "wb") as f:
        pickle.dump(
            {
                "weight_matrix": weight_matrix,
                "word_idx": word_idx,
                "train_trees": train_trees,
                "dev_trees": dev_trees,
                "test_trees": test_trees
            },
            f
        )
    print("dump end")


# Define the TreeLstm cell.
# The various RNN implementations are essentially obtained by overriding
# `__call__` of the RNN cell base class; one only has to fix the nested
# input/output format of state and output. Here BasicLSTMCell is merely
# extended to two incoming states with one forget gate for each of them.
class BinaryTreeLSTMCell(tf.contrib.rnn.BasicLSTMCell):
    """LSTM cell whose state input is a pair of (c, h) states."""

    def __init__(self, num_units, keep_prob=1.0):
        """Create the cell.

        Args:
            num_units: passed to BasicLSTMCell.
            keep_prob: keep probability for dropout on the candidate `j`;
                either a Python float or (as used in this file) a tensor.
        """
        super(BinaryTreeLSTMCell, self).__init__(num_units)
        self._keep_prob = keep_prob

    def __call__(self, inputs, state, scope=None):
        """Run one step.

        Args:
            inputs: input tensor, concatenated with both h states on axis 1.
            state: pair `((c0, h0), (c1, h1))` of left/right states.
            scope: optional variable scope; defaults to the class name.

        Returns:
            `(new_h, LSTMStateTuple(new_c, new_h))`.
        """
        with tf.variable_scope(scope or type(self).__name__):
            lhs, rhs = state
            c0, h0 = lhs
            c1, h1 = rhs
            concat = tf.contrib.layers.linear(
                tf.concat([inputs, h0, h1], 1), 5 * self._num_units)
            # i: input gate, j: candidate, f0/f1: forget gates for c0/c1,
            # o: output gate.
            i, j, f0, f1, o = tf.split(
                value=concat, num_or_size_splits=5, axis=1)
            j = self._activation(j)
            # Skip dropout only when keep_prob is a Python float >= 1; a
            # non-float (e.g. placeholder) keep_prob always gets the op.
            if not isinstance(self._keep_prob, float) or self._keep_prob < 1:
                j = tf.nn.dropout(j, self._keep_prob)
            new_c = (c0 * tf.sigmoid(f0 + self._forget_bias) +
                     c1 * tf.sigmoid(f1 + self._forget_bias) +
                     tf.sigmoid(i) * j)
            new_h = self._activation(new_c) * tf.sigmoid(o)
            new_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
            return new_h, new_state


def construct_treelstm():
    """Build the Tree-LSTM sentiment model, train it and save checkpoints.

    Loads the dataset pickled by `load_data()`, trains for EPOCHS epochs and
    saves a checkpoint whenever the dev-set root accuracy improves.
    """
    # NOTE: pickle must only be used on trusted files; this one is written
    # by load_data() above.
    with open(_data_pickle_path(), "rb") as f:
        data_construct = pickle.load(f)
    weight_matrix = data_construct["weight_matrix"]
    word_idx = data_construct["word_idx"]
    train_trees = data_construct["train_trees"]
    dev_trees = data_construct["dev_trees"]
    test_trees = data_construct["test_trees"]
    print("load end")

    # model construct
    keep_prob_ph = tf.placeholder_with_default(1.0, [])
    lstm_num_units = 300

    # td.ScopedLayer (see the td.Layer notes earlier) controls the variable
    # scope so that the cell's variables are shared between uses.
    tree_lstm = td.ScopedLayer(
        tf.contrib.rnn.DropoutWrapper(
            BinaryTreeLSTMCell(lstm_num_units, keep_prob=keep_prob_ph),
            input_keep_prob=keep_prob_ph,
            output_keep_prob=keep_prob_ph,
        ),
        name_or_scope="tree_lstm"
    )

    # Number of distinct sentiment labels.
    NUM_CLASSES = 5
    # Linear (no activation) output layer over the 5 sentiment labels.
    output_layer = td.FC(NUM_CLASSES, activation=None, name="output_layer")

    # td.Embedding is not partitioned by hash. Because of this simple
    # structure the embedding must not be too large: an initial matrix above
    # 2 GB raises an error, which is what happens with embeddings that were
    # not reduced by filter_glove.
    word_embedding = td.Embedding(
        *weight_matrix.shape, initializer=weight_matrix,
        name="word_embedding"
    )

    # Forward declaration for the recursive tree structure.
    embed_subtree = td.ForwardDeclaration(name="embed_subtree")

    def logits_and_state():
        """Return a block mapping tokens to a (logits, state) pair."""
        # Words missing from word_idx map to the extra last embedding row.
        unknown_idx = len(word_idx)
        lookup_word = lambda word: word_idx.get(word, unknown_idx)

        word2vec = (td.GetItem(0) >> td.InputTransform(lookup_word) >>
                    td.Scalar("int32") >> word_embedding)
        pair2vec = (embed_subtree(), embed_subtree())

        # Leaf: word vector as input, two zero states.
        zero_state = td.Zeros((tree_lstm.state_size,) * 2)
        # Internal node: zero input, the two child states.
        zero_inp = td.Zeros(word_embedding.output_type.shape[0])

        word_case = td.AllOf(word2vec, zero_state)
        pair_case = td.AllOf(zero_inp, pair2vec)

        # Dispatch on the number of tokens: 1 -> word, 2 -> pair of subtrees.
        tree2vec = td.OneOf(len, [(1, word_case), (2, pair_case)])
        return tree2vec >> tree_lstm >> (output_layer, td.Identity())

    def tf_node_loss(logits, labels):
        """Sparse softmax cross-entropy for each node."""
        return tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)

    def tf_fine_grained_hits(logits, labels):
        """1.0 where the argmax class equals the label (all 5 labels)."""
        predictions = tf.cast(tf.argmax(logits, 1), tf.int32)
        return tf.cast(tf.equal(predictions, labels), tf.float64)

    def tf_binary_hits(logits, labels):
        """1.0 where the two-category (classes 3+4 vs 0+1) call is right."""
        softmax = tf.nn.softmax(logits)
        binary_predictions = ((softmax[:, 3] + softmax[:, 4]) >
                              (softmax[:, 0] + softmax[:, 1]))
        binary_labels = labels > 2
        return tf.cast(
            tf.equal(binary_predictions, binary_labels), tf.float64)

    def add_metrics(is_root, is_neutral):
        """Return a block that records loss/hit metrics, outputs the state.

        Input is `(labels, (logits, state))`. The `root_*` metrics are added
        only when `is_root`; the `*_binary_hits` metrics only when the label
        is not neutral.
        """
        c = td.Composition(
            name="predict(is_root=%s, is_neutral=%s)" % (is_root, is_neutral)
        )
        with c.scope():
            labels = c.input[0]
            logits = td.GetItem(0).reads(c.input[1])
            state = td.GetItem(1).reads(c.input[1])

            loss = td.Function(tf_node_loss)
            td.Metric("all_loss").reads(loss.reads(logits, labels))
            if is_root:
                td.Metric("root_loss").reads(loss)

            hits = td.Function(tf_fine_grained_hits)
            td.Metric("all_hits").reads(hits.reads(logits, labels))
            if is_root:
                td.Metric("root_hits").reads(hits)

            # Binary hits are only computed for non-neutral labels.
            if not is_neutral:
                binary_hits = td.Function(tf_binary_hits).reads(logits, labels)
                td.Metric("all_binary_hits").reads(binary_hits)
                if is_root:
                    td.Metric("root_binary_hits").reads(binary_hits)

            c.output.reads(state)
        return c

    def tokenize(s):
        """Split one tree record into (label, sub-expression tokens)."""
        # Drop the outer parentheses, then split off the leading label.
        label, phrase = s[1:-1].split(None, 1)
        return label, sexpr.sexpr_tokenize(phrase)

    def embed_tree(logits_and_state, is_root):
        """Return a block embedding a tree string; output is the state."""
        return td.InputTransform(tokenize) >> td.OneOf(
            # Label "2" selects the second (is_neutral=True) case.
            key_fn=lambda pair: pair[0] == "2",
            case_blocks=(
                add_metrics(is_root, is_neutral=False),
                add_metrics(is_root, is_neutral=True)
            ),
            pre_block=(td.Scalar("int32"), logits_and_state)
        )

    model = embed_tree(logits_and_state(), is_root=True)
    embed_subtree.resolve_to(embed_tree(logits_and_state(), is_root=False))

    compiler = td.Compiler.create(model)
    print("input type: %s" % model.input_type)
    print("output type: %s" % model.output_type)

    # setup for training
    metrics = {k: tf.reduce_mean(v)
               for k, v in compiler.metric_tensors.items()}

    LEARNING_RATE = 0.05
    KEEP_PROB = 0.75
    BATCH_SIZE = 100
    EPOCHS = 20
    # Down-scale the embedding gradient to prevent overfitting.
    EMBEDDING_LEARNING_RATE_FACTOR = 0.1

    train_feed_dict = {keep_prob_ph: KEEP_PROB}
    loss = tf.reduce_sum(compiler.metric_tensors["all_loss"])
    opt = tf.train.AdagradOptimizer(LEARNING_RATE)
    grads_and_vars = opt.compute_gradients(loss)
    found = 0
    for i, (grad, var) in enumerate(grads_and_vars):
        # word_embedding.weights is the variable created for the embedding.
        if var == word_embedding.weights:
            found += 1
            grad = tf.scalar_mul(EMBEDDING_LEARNING_RATE_FACTOR, grad)
            grads_and_vars[i] = (grad, var)
    assert found == 1  # internal consistency check
    train = opt.apply_gradients(grads_and_vars)
    saver = tf.train.Saver()

    sess.run(tf.global_variables_initializer())

    def train_step(batch):
        """Run one optimisation step on `batch`; return the batch loss."""
        train_feed_dict[compiler.loom_input_tensor] = batch
        _, batch_loss = sess.run([train, loss], train_feed_dict)
        return batch_loss

    def train_epoch(train_set):
        """Train on every batch of `train_set`; return the summed loss."""
        return sum(train_step(batch)
                   for batch in td.group_by_batches(train_set, BATCH_SIZE))

    train_set = compiler.build_loom_inputs(train_trees)
    dev_feed_dict = compiler.build_feed_dict(dev_trees)

    def dev_eval(epoch, train_loss):
        """Print dev-set metrics; return the dev root-level accuracy."""
        dev_metrics = sess.run(metrics, dev_feed_dict)
        dev_loss = dev_metrics["all_loss"]
        dev_accuracy = [
            "%s: %.2f" % (k, v * 100)
            for k, v in sorted(dev_metrics.items()) if k.endswith("hits")
        ]
        print('epoch:%4d, train_loss: %.3e, dev_loss_avg: %.3e, dev_accuracy:\n [%s]'
              % (epoch, train_loss, dev_loss, ' '.join(dev_accuracy)))
        return dev_metrics['root_hits']

    best_accuracy = 0.0
    save_path = os.path.join(data_dir, 'sentiment_model')
    for epoch, shuffled in enumerate(td.epochs(train_set, EPOCHS), 1):
        train_loss = train_epoch(shuffled)
        accuracy = dev_eval(epoch, train_loss)
        # Only keep checkpoints that improve the dev root accuracy.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            checkpoint_path = saver.save(sess, save_path, global_step=epoch)
            print('model saved in file: %s' % checkpoint_path)


if __name__ == "__main__":
    load_data()
    construct_treelstm()