FastText Text Classification: Code Implementation

Dataset download: https://pan.baidu.com/s/1zxrKtTYli2iQgK1iNVP9PQ (extraction code: la3w)

In essence fastText is similar to the CBOW model: it is just a simple, shallow neural network.
[Figure: fastText model architecture]
N-gram subwords
Training on whole words as independent units causes several problems:
Low-frequency and rare words do not get enough training and end up with poor representations;
Out-of-vocabulary words (words never seen in training, or misspelled words) are something traditional word-level models cannot handle at all.

The idea is to break a word down to the character level and use character-level n-gram information to capture the ordering of the characters within it.
Rationale: Western languages typically build words from prefixes, suffixes and roots, and Chinese likewise has a long tradition of single characters carrying meaning.
Take google as an example. To mark the word boundaries we add the two characters < and >, turning the word into "<google>". If we extract all tri-gram information we get the set G = { <go, goo, oog, ogl, gle, le> }. In practice we usually extract several n-gram sizes at once, e.g. 2/3/4/5-grams. The original word google is thus represented by a set of character-level n-grams.
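As a quick illustration (this char_ngrams helper is only for this sketch, not part of the post's pipeline), extracting those boundary-marked character n-grams takes a few lines of Python:

def char_ngrams(word, n_values=(3,)):
    # add the boundary markers < and >, then slide a window of each requested size
    token = '<' + word + '>'
    grams = set()
    for n in n_values:
        for i in range(len(token) - n + 1):
            grams.add(token[i:i + n])
    return grams

print(char_ngrams('google'))          # {'<go', 'goo', 'oog', 'ogl', 'gle', 'le>'} (set order may vary)
print(char_ngrams('google', (2, 3)))  # extract 2-grams and 3-grams together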

During training each n-gram gets its own vector, and the vector of the full word is obtained by summing the vectors of all its n-grams. The word vectors and the character-level n-gram vectors are then summed and averaged together to form the model's input.
Experimentally, adding subword n-gram information not only fixes the representation of low-frequency and out-of-vocabulary words, it also typically improves final task accuracy by a few percentage points. The only downside is that the number of parameters to estimate grows, so the model can become quite large.
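To keep the parameter count bounded, the fastText library hashes the character n-grams into a fixed number of buckets instead of giving every distinct n-gram its own embedding row. A minimal sketch of that idea, assuming the char_ngrams helper above (num_buckets and embedding_dim are illustrative values, not taken from this post):

import numpy as np
num_buckets = 100000                                   # illustrative; fastText's default bucket count is larger
embedding_dim = 100                                    # illustrative embedding size
ngram_table = np.zeros((num_buckets, embedding_dim), dtype=np.float32)  # n-gram embeddings, learned in practice

def subword_vector(word, word_vec):
    # word vector = average of the word's own embedding and its hashed n-gram embeddings
    vecs = [word_vec]
    for g in char_ngrams(word):
        bucket = hash(g) % num_buckets                 # hashing trick (Python's str hash is salted per run)
        vecs.append(ngram_table[bucket])
    return np.mean(vecs, axis=0)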


import os
import re
import jieba
import pandas as pd
import tensorflow as tf
import numpy as np
tf.__version__

root='data/百度题库/高中_历史/origin'
ancient_his_df=pd.read_csv(os.path.join(root,'古代史.csv'))
contemporary_his_df=pd.read_csv(os.path.join(root,'现代史.csv'))
modern_his_df=pd.read_csv(os.path.join(root,'近代史.csv'))
ancient_his_df['label']='__label__古代史'
contemporary_his_df['label']='__label__现代史'
modern_his_df['label']='__label__近代史'

def load_stop_words(stop_word_path):
    # read the stop-word list, one word per line
    with open(stop_word_path, 'r', encoding='utf-8') as file:
        stop_words = file.readlines()
    stop_words = [stop_word.strip() for stop_word in stop_words]
    return stop_words

stopwords_path='data/stopwords/哈工大停用词表.txt'
def clean_sentence(line):
    # strip letters, digits, ASCII/full-width punctuation and the literal word "题目", then segment with jieba
    line = re.sub(
            "[a-zA-Z0-9]|[\s+\-\|\!\/\[\]\{\}_,.$%^*(+\"\')]+|[::+——()?【】《》“”!,。?、~@#¥%……&*()]+|题目", '',line)
    tokens = jieba.cut(line, cut_all=False)
    return tokens 
stop_words=load_stop_words(stopwords_path)
def sentence_proc(sentence):
    words=clean_sentence(sentence)
    words=[word for word in words if word not in stop_words]
    return ' '.join(words)

ancient_his_df['item']=ancient_his_df['item'].apply(sentence_proc)
contemporary_his_df['item']=contemporary_his_df['item'].apply(sentence_proc)
modern_his_df['item']=modern_his_df['item'].apply(sentence_proc)
dataset_df=pd.concat([ancient_his_df,contemporary_his_df,modern_his_df])

#fastText section
max_features=20000  # vocabulary size kept by the Tokenizer
class_num=3  # three categories: ancient, modern, contemporary history
ngram_range=2  # add bigram features on top of unigrams
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import OneHotEncoder,LabelBinarizer
from tensorflow.keras import Input,Model
from tensorflow.keras.layers import Embedding,GlobalAveragePooling1D,Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
corpus=dataset_df['item']
text_preprocesser=preprocessing.text.Tokenizer(num_words=max_features,oov_token='<UNK>')
text_preprocesser.fit_on_texts(corpus)
x=text_preprocesser.texts_to_sequences(corpus)
word_dict=text_preprocesser.word_index
#save the vocabulary to a file
with open('data/vocab.txt','w',encoding='UTF8') as f:
    for k,v in word_dict.items():
        f.write(f'{k}\t{str(v)}\n')
lb=LabelBinarizer()
lb.fit(dataset_df['label'])
print(lb.classes_)
y=lb.transform(dataset_df['label'])#convert the labels to one-hot encoding
#split into training and test sets
X_train, X_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=42)
#The two functions below add n-gram features. In practice the n-grams need to be filtered, otherwise results suffer (many combinations have very low frequency and do not help classification); a simple frequency filter is sketched right after these two helpers.
def create_ngram_set(input_list,ngram_value=2):
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))
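#A quick example (sanity check, not part of the original script):
#create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
#-> {(1, 4), (4, 9), (9, 4), (4, 1)}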
def add_ngram(sequences,token_indice,ngram_range=2):
    new_sequences=[]
    for input_list in sequences:
        new_list=input_list[:]
        for ngram_value in range(2,ngram_range+1):
            for i in range(len(new_list)-ngram_value+1):
                ngram=tuple(new_list[i:i+ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return np.array(new_sequences)
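#As mentioned above, most bigrams occur only a handful of times and mainly add parameters and noise.
#A minimal sketch of frequency-based filtering (min_count=5 is an illustrative threshold, not from the original post):
from collections import Counter
def filter_ngrams_by_freq(sequences, ngram_value=2, min_count=5):
    # count, for each n-gram, in how many training sequences it appears (document frequency)
    # and keep only the n-grams seen at least min_count times
    ngram_df = Counter()
    for input_list in sequences:
        ngram_df.update(create_ngram_set(input_list, ngram_value=ngram_value))
    return {ng for ng, c in ngram_df.items() if c >= min_count}
#usage (would replace the unfiltered ngram_set built below):
#ngram_set = filter_ngrams_by_freq(X_train, ngram_value=2, min_count=5)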
if ngram_range>1:
    print('adding {}-gram features'.format(ngram_range))
    ngram_set=set()
    for input_list in X_train:
        for i in range(2,ngram_range+1):
            set_of_ngram=create_ngram_set(input_list,ngram_value=i)
            ngram_set.update(set_of_ngram)
    start_index=max_features+1
    token_indice={v:k+start_index for k,v in enumerate(ngram_set)}
    indice_token={token_indice[k]:k for k in token_indice}
    max_features=np.max(list(indice_token.keys()))+1
    X_train=add_ngram(X_train,token_indice,ngram_range)
    X_test=add_ngram(X_test,token_indice,ngram_range)
    
print('average train sequence length:{}'.format(np.mean(list(map(len,X_train)),dtype=int)))
print('average test sequence length:{}'.format(np.mean(list(map(len,X_test)),dtype=int)))
maxlen=int(np.ceil(max(np.mean(list(map(len,X_train))),np.mean(list(map(len,X_test))))))
print('average length:{}'.format(maxlen))
#average train sequence length:256
#average test sequence length:220
#average length:257
X_train=sequence.pad_sequences(X_train,maxlen=maxlen,padding='post',truncating='post')
X_test=sequence.pad_sequences(X_test,maxlen=maxlen,padding='post',truncating='post')
print('x_train shape:',X_train.shape)
print('x_test shape:',X_test.shape)
#x_train shape: (3976, 257)
#x_test shape: (994, 257)
#fastText model:
#1. the first layer is an Embedding layer (batch, sequence, embedding)
#2. GlobalAveragePooling1D then averages over the sequence dimension
#3. the last layer is a Dense layer with class_num outputs and a sigmoid activation
#build the model
class FastText(object):
    def __init__(self,maxlen,max_features,embedding_dims,class_num=1,
                last_activation='sigmoid'):
        self.maxlen=maxlen
        self.max_features=max_features
        self.embedding_dims=embedding_dims
        self.class_num=class_num
        self.last_activation=last_activation
    def get_model(self):
        input=Input((self.maxlen,))
        embedding=Embedding(self.max_features,self.embedding_dims,input_length=self.maxlen)(input)
        x=GlobalAveragePooling1D()(embedding)
        output=Dense(self.class_num,activation=self.last_activation)(x)
        model=Model(inputs=input,outputs=output)
        return model
#train and evaluate
batch_size=128
embedding_dims=300
epochs=20
model=FastText(maxlen,max_features,embedding_dims,class_num).get_model()
model.compile('adam','binary_crossentropy',metrics=['accuracy'])
model.summary()

print('Train...')
early_stopping=EarlyStopping(monitor='val_accuracy',patience=5,mode='max')

history=model.fit(X_train,y_train,
                 batch_size=batch_size,
                 epochs=epochs,
                 workers=32,
                 use_multiprocessing=True,
                 callbacks=[early_stopping],
                 validation_data=(X_test,y_test))
from sklearn.metrics import classification_report,multilabel_confusion_matrix,confusion_matrix
results=model.evaluate(X_test,y_test,verbose=2)
print(results)
#994/1 - 4s - loss: 0.2895 - accuracy: 0.8115
#[0.3930790154506983, 0.8115359]
import matplotlib.pyplot as plt
history_dict=history.history
history_dict.keys()
acc=history_dict['accuracy']
val_acc=history_dict['val_accuracy']
loss=history_dict['loss']
val_loss=history_dict['val_loss']
epochs=range(1,len(acc)+1)
plt.plot(epochs,loss,'bo',label='Train_loss')
plt.plot(epochs,val_loss,'b',label='Validation loss')
plt.show()
print('test...')
y_pred=model.predict(X_test)
y_pred=y_pred.argmax(axis=1)
y_true=y_test.argmax(axis=1)
print(classification_report(y_true,y_pred))
confusion_matrix(y_true,y_pred)
#test...
#              precision    recall  f1-score   support
#
#           0       0.91      0.84      0.87       213
#           1       0.67      0.75      0.71       451
#           2       0.65      0.58      0.61       330
#
#    accuracy                           0.71       994
#   macro avg       0.74      0.72      0.73       994
#weighted avg       0.72      0.71      0.71       994
#The figures below show the model structure and the training/validation loss curves

[Figure: model structure (model.summary() output)]
[Figure: training and validation loss curves]


Reprinted from blog.csdn.net/weixin_42813521/article/details/104987948