skift: scikit-learn wrappers for Python fastText
What is skift?
skift provides several scikit-learn-compatible wrappers around fastText models. fastText is similar in spirit to word2vec and is mainly used for fast text classification. Its strengths are classification speed and its n-gram features, which easily capture local information within a sentence and build features for previously unseen words. Its drawback is that memory usage grows with the size of the corpus. So how can the memory problem be addressed? fastText offers three remedies:
- filter out words that occur only rarely;
- store n-gram features with hashing;
- use word granularity rather than character granularity.
For example, take the sentence 我喜欢去中国. With character granularity, 2-grams produce the features
我喜 喜欢 欢去 去中 中国
With word granularity, the features are
我喜欢 喜欢去 去中国
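To make the difference concrete, here is a tiny sketch that builds bigrams over characters versus over pre-segmented words; the `bigrams` helper is purely illustrative and not part of fastText or skift:

```python
# Illustrative only: compare character-level and word-level bigrams.
def bigrams(tokens):
    return [tokens[i] + tokens[i + 1] for i in range(len(tokens) - 1)]

chars = list('我喜欢去中国')             # character granularity
words = ['我', '喜欢', '去', '中国']      # word granularity (already segmented)

print(bigrams(chars))  # ['我喜', '喜欢', '欢去', '去中', '中国']
print(bigrams(words))  # ['我喜欢', '喜欢去', '去中国']
```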
Good introductions to how fastText works are the articles FastText文本分类算法学习笔记 and FastText的内部机制, so the details are not repeated here.
Below, skift's fastText wrapper is used to build a template model for fine-grained sentiment analysis.
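Before the full template, a minimal usage sketch of skift itself may help (adapted from the pattern in the skift README as I recall it; the toy texts and labels are made up). Keyword arguments such as `lr`, `epoch`, `minCount` and `bucket` are passed straight through to fastText, where `minCount` and `bucket` correspond to the first two memory remedies listed above:

```python
import pandas as pd
from skift import FirstColFtClassifier

# Toy data: the first column of the DataFrame holds the text.
df = pd.DataFrame(
    data=[['woof woof', 0], ['meow meow', 1], ['woof', 0], ['meow', 1]],
    columns=['txt', 'lbl'])

# minCount drops rare words; bucket hashes n-gram features into a fixed table.
sk_clf = FirstColFtClassifier(lr=0.3, epoch=10, minCount=1, bucket=200000)
sk_clf.fit(df[['txt']], df['lbl'])
print(sk_clf.predict([['woof woof meow']]))
```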
```python
from tqdm import tqdm
from skift import FirstColFtClassifier
from sklearn.model_selection import KFold
import numpy as np
import os
import pickle


class BasicModel(object):
    """Base class defining the model interface."""

    def __init__(self):
        pass

    def create_model(self, kfold_X_train, y_train, kfold_X_test, y_test, test):
        pass

    # Generate batches
    def batch_iter(self, data, batch_size, num_epochs=1, shuffle=True):
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
        for epoch in range(num_epochs):
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((1 + batch_num) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    def get_f1_score(self, x, y, verbose=False):
        # x: predicted labels, y: true labels; label 0 means "no label"
        tp = np.sum(np.logical_and(y > 0, x == y))
        # false positives: predicted a label where there is none, or predicted the wrong label
        fp = np.sum(np.logical_and(x > 0, y == 0)) + np.sum(np.logical_and(x * y > 0, y != x))
        # false negatives: missed a label that should have been predicted
        fn = np.sum(np.logical_and(y > 0, x == 0))
        P = float(tp) / (float(tp + fp) + 1e-8)
        R = float(tp) / (float(tp + fn) + 1e-8)
        F = 2 * P * R / (P + R + 1e-8)
        if verbose:
            print('P->', P)
            print('R->', R)
            print('F->', F)
        return F


class BasicStaticModel(BasicModel):

    def __init__(self, config=None, n_folds=5, name='BasicStaticModel'):
        self.n_folds = n_folds
        self.name = name
        self.config = config
        self.kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)

    def train_predict(self, train, train_y, test, option=None):
        name = self.name
        # 10 aspect labels, 4 sentiment classes each
        predict = np.zeros((test.shape[0], 10, 4))
        oof_predict = np.zeros((train.shape[0], 10, 4))
        scores_f1 = []
        for train_index, dev_index in self.kf.split(train):
            kfold_X_train, kfold_X_val = train[train_index], train[dev_index]
            y_train, y_dev = train_y[train_index], train_y[dev_index]
            model_dict = {}
            print('start train model:')
            for idx in tqdm(range(10)):
                label = y_train[:, idx]
                model = self.create_model()
                model.fit(kfold_X_train, label)
                model_dict[idx] = model
            print('complete train model')
            print('start validate model')
            f1_scores = []
            for idx in tqdm(range(10)):
                label_dev = y_dev[:, idx]
                model = model_dict[idx]
                dev_prob = model.predict_proba(kfold_X_val)
                test_prob = model.predict_proba(test)
                oof_predict[dev_index, idx] = dev_prob
                predict[:, idx] += test_prob / self.n_folds
                dev_predict = np.argmax(dev_prob, 1)
                f1_scores.append(self.get_f1_score(dev_predict, label_dev))
            f1_score = np.mean(f1_scores)
            scores_f1.append(f1_score)
            print('f1_scores-> ', f1_scores)
            print('f1_score: ', f1_score)
            if self.config is not None and self.config.is_debug:
                break
        print('Total f1->', scores_f1)
        print('Mean of total f1: ', np.mean(scores_f1))
        # Save out-of-fold and test-set predictions
        os.makedirs('../data/result-ml', exist_ok=True)
        with open('../data/result-ml/{}_oof_f1_{}.pkl'.format(name, str(np.mean(scores_f1))), 'wb') as f:
            pickle.dump(oof_predict, f)
        with open('../data/result-ml/{}_pre_f1_{}.pkl'.format(name, str(np.mean(scores_f1))), 'wb') as f:
            pickle.dump(predict, f)
        print('done')


class Fasttext(BasicStaticModel):

    def __init__(self, name='basicModel', n_folds=5, config=None):
        super().__init__(name=name, n_folds=n_folds, config=config)

    def create_model(self):
        # Override: one skift fastText classifier per aspect label
        sk_clf = FirstColFtClassifier(lr=1.0, epoch=10,
                                      wordNgrams=1,
                                      minCount=5, verbose=2)
        return sk_clf
```
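Finally, a hedged sketch of how this template might be driven. The file paths, the `content` column name, and the `SimpleNamespace` config are assumptions for illustration only; the data is expected to provide one text column (already segmented, space-separated) plus 10 aspect-label columns with values in 0-3, matching the `(n, 10, 4)` prediction arrays above.

```python
from types import SimpleNamespace
import pandas as pd

# Hypothetical paths and column names -- adjust to your own dataset.
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

train_x = train_df[['content']].values   # 2-D array, text in the first column
test_x = test_df[['content']].values
train_y = train_df.iloc[:, 1:11].values  # 10 aspect labels, each in {0, 1, 2, 3}

config = SimpleNamespace(is_debug=False)  # train_predict only reads config.is_debug
ft_model = Fasttext(name='fasttext', n_folds=5, config=config)
ft_model.train_predict(train_x, train_y, test_x)
```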