# 下载相应数据集
# wget https://storage.googleapis.com/cluebenchmark/tasks/tnews_public.zip
import pandas as pd
import json
import jieba
## 0. Hands-on word vectors with gensim
# 1.读取预处理的数据集
# 2.训练
# 3.结果
# 1.1数据预处理
def get_sentence(data_file):
    """Read a JSON-lines file and return the ``sentence`` field of each record.

    Args:
        data_file: Path to a UTF-8 text file holding one JSON object per
            line; every object must contain a ``sentence`` key.

    Returns:
        A list with one ``sentence`` value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``sentence`` key.
    """
    sentence = []
    # The context manager closes the handle even if parsing fails part-way;
    # previously the file was opened and never closed.
    with open(data_file, encoding='utf-8') as f:
        # Iterate the file lazily rather than materialising all lines first.
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. an extra newline at end of file)
                # instead of crashing inside json.loads.
                continue
            sentence.append(json.loads(line)['sentence'])
    return sentence
# Load the raw sentences of each split of the TNEWS dataset.
train_sentence = get_sentence(r'tnews_public/train.json')
test_sentence = get_sentence(r'tnews_public/test.json')
dev_sentence = get_sentence(r'tnews_public/dev.json')
# Full corpus: all three splits concatenated, each sentence segmented into a
# list of tokens with jieba.
train_data = [
    list(jieba.cut(text))
    for text in train_sentence + test_sentence + dev_sentence
]
print(train_data)
print(len(train_data))
# 3.构建词向量模型
from gensim.models.word2vec import LineSentence
from gensim.models import word2vec
import gensim
import logging
# Configure root logging at INFO level with records shaped "time:level:message".
# Every %-style placeholder needs its conversion type: the previous format
# ended in a bare "%(message)" (no trailing "s"), which makes each log call
# fail with "incomplete format" instead of printing the message.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
# 构建模型
from gensim.models import FastText
# print(help(FastText))
"""
sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100, alpha=0.025,
window=5, min_count=5,
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6,
sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
max_final_vocab=None
"""
# model = FastText(train_data,vector_size=4,window=3,min_count=1,epochs=10)
"""
sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
comment=None, max_final_vocab=None,
"""
## Skip-gram vs. CBOW: sg=1 selects skip-gram
model = word2vec.Word2Vec(
    sentences=train_data,
    vector_size=200,
    min_count=4,
    workers=4,
    sg=1,
    epochs=1,
)
# Look up the ten words closest to the query word
print(model.wv.most_similar(positive=['金融'], topn=10))
model_save_path = 'word2vec.model'
model.save(model_save_path)
# Load the model back from disk
model = word2vec.Word2Vec.load(model_save_path)
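# Article: Text representation methods -- word embedding vectors (word2vec)
# Reposted from blog.csdn.net/Cocktail_py/article/details/119857792
# (Web-page residue from the original post: "You may also like",
# "Today's recommendations", "Weekly ranking")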