#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Build one sentence vector per input line by averaging its word vectors.

Each line of the segmented input file is expected to look like::

    <key> <f1> <f2> <f3> <f4> <f5> <word> <word> ...

The output file gets one record per input line::

    <key> [v0, v1, ..., v(dim-1), int(f2), int(f3), int(f4), int(f5)]\r\n

where ``v*`` is the element-wise mean of the vectors of every word that is
present in the embedding vocabulary.

NOTE(review): in the original (collapsed) source every delimiter appeared as a
single space, under which the six-field header could never be separated from
the words.  This version therefore tokenises each line on any whitespace and
treats the first six tokens as the header -- confirm against the real data.
"""
import codecs  # opens files with an explicit encoding
import sys

import numpy as np

EMBEDDING_PATH = 'news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
INPUT_PATH = "xlj_fenci.txt"
OUTPUT_PATH = 'xlj_vec.txt'

VECTOR_DIM = 64            # fallback when the vectors do not expose vector_size
HEADER_FIELDS = 6          # leading tokens that are metadata, not words
NUMERIC_FIELDS = (2, 3, 4, 5)  # header positions appended to the vector
PROGRESS_EVERY = 100       # print the line index every N lines


def load_word_vectors(path=EMBEDDING_PATH):
    """Load binary word2vec-format embeddings from *path*.

    ``gensim`` is imported here rather than at module level so the pure
    helpers in this file can be imported without loading it.

    The object returned by ``load_word2vec_format`` is already the
    ``KeyedVectors`` lookup table; the original code went through its ``.wv``
    attribute, which on ``KeyedVectors`` was only a deprecated alias for the
    object itself.
    """
    import gensim

    return gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)


def parse_line(line):
    """Split one input line into ``(key, numeric_fields, words)``.

    Args:
        line: A text line; surrounding whitespace and the line ending are
            ignored.

    Returns:
        A tuple of the first token, the header tokens at ``NUMERIC_FIELDS``
        converted with ``int(float(...))``, and the list of remaining tokens.

    Raises:
        ValueError: If the line has fewer than ``HEADER_FIELDS`` tokens or a
            numeric field cannot be parsed.
    """
    tokens = line.split()
    if len(tokens) < HEADER_FIELDS:
        raise ValueError(
            "expected at least %d header fields, got %d: %r"
            % (HEADER_FIELDS, len(tokens), line)
        )
    header = tokens[:HEADER_FIELDS]
    numeric = [int(float(header[pos])) for pos in NUMERIC_FIELDS]
    return header[0], numeric, tokens[HEADER_FIELDS:]


def sentence_vector(words, word_vec, dim=VECTOR_DIM):
    """Return the mean vector of the known *words* as a plain list.

    Args:
        words: Iterable of tokens.
        word_vec: Mapping-like object; ``word_vec[word]`` must return a
            length-*dim* array or raise ``KeyError`` for unknown words.
        dim: Vector length.

    Returns:
        A list of *dim* numbers.  When no word is known the list holds
        integer zeros, matching what the original script wrote.
    """
    # Integer zeros on purpose: adding the first float vector promotes the
    # accumulator to floating point, while the "no known word" case keeps
    # printing as 0 rather than 0.0, as before.
    total = np.zeros(dim, dtype=int)
    known = 0
    for word in words:
        try:
            vector = word_vec[word]
        except KeyError:
            # Out-of-vocabulary word: it does not contribute to the mean.
            continue
        total = vector + total
        known += 1
    if known:
        total = total / known
    return total.tolist()


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH,
         embedding_path=EMBEDDING_PATH, word_vec=None, dim=None):
    """Convert every line of *input_path* into a record in *output_path*.

    Args:
        input_path: UTF-8 text file with one segmented sentence per line.
        output_path: Destination file, written as UTF-8 with ``\\r\\n``
            record terminators.
        embedding_path: Embedding file to load when *word_vec* is not given.
        word_vec: Optional pre-loaded word-vector lookup.
        dim: Vector length; defaults to ``word_vec.vector_size`` when that
            attribute exists, otherwise ``VECTOR_DIM``.

    Blank lines are skipped.  The 0-based line index is printed every
    ``PROGRESS_EVERY`` lines and ``end`` is printed when done.
    """
    if word_vec is None:
        word_vec = load_word_vectors(embedding_path)
    if dim is None:
        dim = getattr(word_vec, "vector_size", VECTOR_DIM)

    # codecs.open works in binary mode underneath, so the explicit "\r\n"
    # terminator is written exactly as given on every platform.
    with codecs.open(input_path, 'r', 'utf-8') as source, \
            codecs.open(output_path, 'w', 'utf-8') as target:
        for index, line in enumerate(source):
            if index % PROGRESS_EVERY == 0:
                print(index)
            if not line.strip():
                continue
            key, numeric, words = parse_line(line)
            # Kept as a plain list: easy to extend and to feed to a classifier.
            features = sentence_vector(words, word_vec, dim)
            features.extend(numeric)
            target.write(key + " " + str(features) + "\r\n")
    print("end")


if __name__ == "__main__":
    sys.exit(main())
词向量求平均合成句向量
猜你喜欢
转载自blog.csdn.net/qq_35398413/article/details/81148256
今日推荐
周排行