JavaScript类型推断(3) - 算法模型解析

构建训练模型

上一节我们介绍了生成训练集、测试集、验证集的方法，以及生成词表的方法。
这5个文件（三个数据集文件和源、目标两个词表文件）构成了训练的基本素材：

# The five input artefacts: three CTF data sets and the two word lists.
# NOTE(review): the meaning of 'location' is not visible here; it is 0 for the
# data sets and 1 for the word lists — confirm against whoever reads it.
files = dict(
	train=dict(file='data/train.ctf', location=0),
	valid=dict(file='data/valid.ctf', location=0),
	test=dict(file='data/test.ctf', location=0),
	source=dict(file='data/source_wl', location=1),
	target=dict(file='data/target_wl', location=1),
)

词表我们需要转换一下格式，放到哈希表里：

# load dictionaries
def _read_word_list(path):
	"""Return the lines of the file at *path*, each without its trailing newline.

	The file is opened in a ``with`` block so the handle is closed as soon as
	the list has been built.
	"""
	with open(path) as handle:
		return [line.rstrip('\n') for line in handle]


source_wl = _read_word_list(files['source']['file'])
target_wl = _read_word_list(files['target']['file'])
# Map every word to its (0-based) line index in the word list; if a word
# occurs more than once, the last occurrence wins.
source_dict = {word: index for index, word in enumerate(source_wl)}
target_dict = {word: index for index, word in enumerate(target_wl)}

下面是一些全局参数：

# Sizes derived from the word lists, followed by the training hyper-parameters
vocab_size = len(source_dict)  # number of distinct entries in the source word list
num_labels = len(target_dict)  # number of distinct entries in the target word list
epoch_size = 17.955*1000*1000  # samples counted per epoch in train(); presumably the training-set size — TODO confirm
minibatch_size = 5000  # size passed to next_minibatch()
emb_dim = 300  # output dimension of the embedding layer
hidden_dim = 650  # dimension of input t; each GRU direction uses hidden_dim//2
num_epochs = 10  # number of iterations of the outer loop in train()

下面我们定义x、y、t三个输入变量，它们的维度分别是输入词表大小、输出标签数和隐藏层维度：

# Create the sequence input containers: the input feature (x), the label (y),
# and the extra input (t) that enhance_data() fills in and the model adds to
# the encoder output.
x = C.sequence.input_variable(vocab_size, name="x")
y = C.sequence.input_variable(num_labels, name="y")
t = C.sequence.input_variable(hidden_dim, name="t")

好，我们开始看下训练的流程：

# Build the network function, apply it to the inputs to get the encoder output
# (enc) and the softmax output (dec), then create the trainer and start training.
# create_trainer() and train() read enc/dec/trainer as module-level names, so
# the order of these statements matters.
model = create_model()
enc, dec = model(x, t)
trainer = create_trainer()
train()

训练模型

首先是一个词嵌入层：

def create_model():
	"""Build the layers and return a function ``recode(x, t) -> (enc, dec)``."""
	# Embedding layer with emb_dim output dimensions.
	embed = C.layers.Embedding(emb_dim, name='embed')

然后是两个双向的循环神经网络（使用GRU），一个全连接网络，和一个dropout：

	# Two bidirectional GRU layers. Each direction gets hidden_dim//2 units, and
	# BiRecurrence splices the forward and backward outputs together.
	encoder = BiRecurrence(C.layers.GRU(hidden_dim//2), C.layers.GRU(hidden_dim//2))
	recoder = BiRecurrence(C.layers.GRU(hidden_dim//2), C.layers.GRU(hidden_dim//2))
	# Dense projection onto the num_labels outputs.
	project = C.layers.Dense(num_labels, name='classify')
	# Dropout with rate 0.5, applied to the recoder output before the projection.
	do = C.layers.Dropout(0.5)

然后把上面的四项组合起来：

	def recode(x, t):
		"""Apply the network to input ``x`` and the extra input ``t``.

		Returns a pair ``(enc, dec)``: the encoder output and the softmax
		over the projected recoder output.
		"""
		embedded = embed(x)
		normalized = C.layers.LayerNormalization()(embedded)

		enc = encoder(normalized)
		# The recoder sees the encoder output summed with t.
		recoded = recoder(enc + t)
		logits = project(do(recoded))

		return enc, C.ops.softmax(logits)
	return recode

其中双向循环神经网络定义如下：

def BiRecurrence(fwd, bwd):
	"""Run ``fwd`` forwards and ``bwd`` backwards over one input and splice the results.

	Both recurrences are applied to the same placeholder, so the returned
	function takes a single input.
	"""
	forward = C.layers.Recurrence(fwd)
	backward = C.layers.Recurrence(bwd, go_backwards=True)
	seq = C.placeholder()
	return C.splice(forward(seq), backward(seq))

构建训练过程

首先定义损失函数，它返回两部分：一部分是交叉熵损失（loss），另一部分是分类错误率：

def criterion(model, labels):
	"""Return ``(loss, error)`` for the outputs ``model`` against ``labels``.

	The loss is the negated sum of ``labels * log(model)``; the error is
	``C.classification_error(model, labels)``.
	"""
	log_outputs = C.ops.log(model)
	cross_entropy = -C.reduce_sum(labels * log_outputs)
	error_rate = C.classification_error(model, labels)
	return cross_entropy, error_rate

有了损失函数之后，我们使用带动量的Adam算法进行梯度下降训练：

扫描二维码关注公众号，回复： 9321347 查看本文章

def create_trainer():
	"""Build the Adam-based trainer for the global ``dec`` output.

	Positions whose label index is 0 are masked out of both the model output
	and the loss. Returns the ``C.Trainer``.
	"""
	def label_mask():
		# argmax(y) clipped to [0, 1]: 0 where the label index is 0, else 1.
		return C.ops.clip(C.ops.argmax(y), 0, 1)

	loss, label_error = criterion(dec * label_mask(), y)
	loss *= label_mask()

	# Per-sample learning rates, one entry per epoch_size samples.
	rates = [1e-3, 1e-3, 5e-4, 5e-4, 1e-4]
	learner = C.adam(
		parameters=dec.parameters,
		lr=C.learning_parameter_schedule_per_sample(rates, epoch_size=int(epoch_size)),
		momentum=C.momentum_as_time_constant_schedule(1000),
		gradient_clipping_threshold_per_sample=15,
		gradient_clipping_with_truncation=True,
	)

	printer = C.logging.ProgressPrinter(tag='Training', num_epochs=num_epochs)
	result = C.Trainer(dec, (loss, label_error), learner, printer)
	C.logging.log_number_of_parameters(dec)
	return result

训练

定义好模型之后，我们就可以训练了。
首先我们可以利用CNTK.io包的功能定义一个数据的读取器：

def create_reader(path, is_training):
	"""Return a ``MinibatchSource`` over the CTF file at ``path``.

	Field ``S0`` is exposed as the sparse stream ``source`` and ``S1`` as the
	sparse stream ``slot_labels``. When ``is_training`` is true the data is
	randomized and repeated indefinitely; otherwise it is read in one sweep.
	"""
	streams = C.io.StreamDefs(
		source=C.io.StreamDef(field='S0', shape=vocab_size, is_sparse=True),
		slot_labels=C.io.StreamDef(field='S1', shape=num_labels, is_sparse=True),
	)
	if is_training:
		sweeps = C.io.INFINITELY_REPEAT
	else:
		sweeps = 1
	deserializer = C.io.CTFDeserializer(path, streams)
	return C.io.MinibatchSource(deserializer, randomize=is_training, max_sweeps=sweeps)

然后我们就可以利用这个读取器读取数据开始训练了：

def train():
	"""Train for ``num_epochs`` epochs.

	Each minibatch is augmented by ``enhance_data`` before it is passed to the
	trainer. After every epoch a summary is printed, a checkpoint is saved,
	and ``validate()`` and ``evaluate()`` are run.
	"""
	reader = create_reader(files['train']['file'], is_training=True)
	printer = C.logging.ProgressPrinter(freq=10, tag='Training')
	seen = 0  # samples consumed so far, counted across all epochs
	for epoch_no in range(1, num_epochs + 1):
		target = epoch_no * epoch_size
		while seen < target:
			batch = reader.next_minibatch(minibatch_size, input_map={
				x: reader.streams.source,
				y: reader.streams.slot_labels
			})
			# Add the t input derived from the current encoder output.
			enhance_data(batch, enc)
			trainer.train_minibatch(batch)
			printer.update_with_trainer(trainer, with_metric=True)
			seen += batch[y].num_samples
		printer.epoch_summary(with_metric=True)
		trainer.save_checkpoint("models/model-" + str(epoch_no) + ".cntk")
		validate()
		evaluate()

上面的代码中，enhance_data需要解释一下。
我们的数据并非是完全线性的数据，还需要进行一个数据增强的处理过程：对每个序列，把同一个输入词在各个位置上的编码器输出（enc）取平均，再把这个平均值作为输入t送回给模型：

def enhance_data(data, enc):
	"""Add the ``t`` input to ``data`` in place.

	For every sequence, each position receives the mean of the encoder
	outputs over all positions of that sequence that carry the same input
	index. Returns ``None``.
	"""
	encoded = enc.eval({x: data[x]})
	token_ids = C.ops.argmax(x).eval({x: data[x]})
	per_sequence = []
	for seq_no, seq_ids in enumerate(token_ids):
		ids = [int(seq_ids[pos]) for pos in range(len(seq_ids))]
		sums = {}
		occurrences = {}
		for pos, token in enumerate(ids):
			if token in sums:
				sums[token] += encoded[seq_no][pos]
				occurrences[token] += 1
			else:
				# No copy is made: the first occurrence's row is the accumulator.
				sums[token] = encoded[seq_no][pos]
				occurrences[token] = 1
		for token in sums:
			sums[token] /= occurrences[token]
		averaged = [sums[token] for token in ids]
		per_sequence.append(np.array(np.float32(averaged)))
	# Wrap the arrays in a minibatch source to get them in minibatch form.
	source = C.io.MinibatchSourceFromData(dict(t=(per_sequence, C.layers.typing.Sequence[C.layers.typing.tensor])))
	extra = source.next_minibatch(minibatch_size)
	data[t] = extra[source.streams['t']]

测试和验证

测试和验证的过程中，也需要我们上面介绍的数据增强的过程：

def validate():
	"""Test the trainer on the validation file and print the test summary."""
	reader = create_reader(files['valid']['file'], is_training=False)

	def fetch():
		return reader.next_minibatch(minibatch_size, input_map={
			x: reader.streams.source,
			y: reader.streams.slot_labels
		})

	batch = fetch()
	# The loop ends when the reader returns an empty minibatch.
	while batch:
		enhance_data(batch, enc)
		trainer.test_minibatch(batch)
		batch = fetch()
	trainer.summarize_test_progress()

evaluate与validate逻辑完全一样，只是读取的文件不同：

def evaluate():
	"""Test the trainer on the test file and print the test summary."""
	reader = create_reader(files['test']['file'], is_training=False)

	def fetch():
		return reader.next_minibatch(minibatch_size, input_map={
			x: reader.streams.source,
			y: reader.streams.slot_labels
		})

	batch = fetch()
	# The loop ends when the reader returns an empty minibatch.
	while batch:
		# Add the t input, then test the model on the augmented minibatch.
		enhance_data(batch, enc)
		trainer.test_minibatch(batch)
		batch = fetch()
	trainer.summarize_test_progress()

Jtag特工博客专家

发布了187 篇原创文章 · 获赞 128 · 访问量 34万+

他的留言板关注