20180923 word2vec相似度改进(不浪费句子)

改进点:没有词向量的词直接丢弃,不放进词集合里;句子中其余有词向量的词仍然参与相似度计算,这样就不会因为个别未登录词而浪费整句。


#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import division  #除法
import sys
import codecs   #可以以特定编码打开文件
import jieba
import jieba.posseg as pseg
reload(sys)               #zzh说这种方法不好,不要再用了!!!  可是真的很好用啊 QAQ
sys.setdefaultencoding('utf-8')
import gensim

# Load the pre-trained embeddings stored in the binary word2vec format.
# load_word2vec_format() already returns a KeyedVectors object, so it is used
# directly as `word_vec`; going through the `.wv` attribute of the result is
# deprecated in gensim 3.x and unavailable in gensim 4.
# NOTE: a full model written with Word2Vec.save() would instead be loaded with
# gensim.models.Word2Vec.load("22620491.model") and queried through its `.wv`.
word_vec = gensim.models.KeyedVectors.load_word2vec_format(
    'news_12g_baidubaike_20g_novel_90g_embedding_64.bin', binary=True)

# Smoke test: print one vector to confirm that unicode keys can be looked up.
print(word_vec[u'难过'])

# Evaluate a nearest-anchor emotion classifier on a labelled, pre-segmented
# corpus.
#
# Each input line is expected to look like "<label>  <w1> <w2> ...": the label
# ("0".."4") comes first, a double space separates it from the text, and the
# words are separated by single spaces.  For every line the in-vocabulary
# words are compared (word_vec.n_similarity) with one anchor word per emotion;
# the anchor with the highest similarity is the prediction.
#
# Printed at the end, in this order:
#   (right, wrong, total)   - correct predictions, lines that could not be
#                             scored, lines that were scored
#   per-label sample counts
#   per-label correct counts
#   per-label accuracy (0.0 for a label that has no samples)
#
# NOTE: writing per-line predictions to 'fenlei.txt' was disabled in the
# original script and is intentionally still not done here.

# Label strings in the corpus; the position in this tuple is the class index.
LABELS = ("0", "1", "2", "3", "4")

# One anchor word list per class, in the same order as LABELS.
# Hoisted out of the loop because they never change.
EMOTION_ANCHORS = (
    [u'乐'],
    [u'哀'],
    [u'怒'],
    [u'惊'],
    [u'恶'],
)

right, wrong, total = 0, 0, 0
label_counts = [0] * len(LABELS)    # annotated samples per class
correct_counts = [0] * len(LABELS)  # correctly classified samples per class

# `with` guarantees the file is closed even if processing fails half-way.
with codecs.open("xlj_fenci.txt", 'r', 'utf-8') as f:
    for line_no, line in enumerate(f):
        # Progress indicator.  enumerate() gives the real position; the
        # previous lines.index(line) lookup was O(n) per line and returned the
        # first occurrence for duplicated lines.
        if line_no % 500 == 0:
            print(line_no)

        # Strip the line ending first so it cannot stay glued to the last
        # word (which would make that word look out-of-vocabulary).
        fields = line.rstrip("\r\n").split("  ")
        if len(fields) < 2:
            # No "<label>  <text>" separator (e.g. a blank line): skip it
            # instead of crashing with IndexError; it is not counted anywhere.
            continue

        label = fields[0].split(" ")[0]
        if label in LABELS:
            label_counts[LABELS.index(label)] += 1

        # Keep only the words that have a vector; the rest of the sentence is
        # still used, so one unknown word does not waste the whole line.
        known_words = [word for word in fields[1].split(" ") if word in word_vec]
        if not known_words:
            # Nothing left to compare: the line cannot be scored.
            wrong += 1
            continue

        try:
            scores = [word_vec.n_similarity(known_words, anchor)
                      for anchor in EMOTION_ANCHORS]
        except (KeyError, ZeroDivisionError):
            # KeyError: an anchor word is missing from the vocabulary.
            # ZeroDivisionError: gensim's signal for an empty word list.
            wrong += 1
            continue

        # Index of the highest score (the first one wins on ties).
        predicted = scores.index(max(scores))
        if str(predicted) == label:
            right += 1
            correct_counts[predicted] += 1
        total += 1

# Single-argument print of a tuple gives the same "(a, b, c)" output on
# Python 2 (where the original multi-value print was a tuple) and Python 3.
print((right, wrong, total))
print(tuple(label_counts))
print(tuple(correct_counts))
# Per-class accuracy; a class without samples reports 0.0 instead of raising
# ZeroDivisionError.  float() keeps true division independent of __future__.
print(tuple((hits / float(count)) if count else 0.0
            for hits, count in zip(correct_counts, label_counts)))
print("end")

猜你喜欢

转载自blog.csdn.net/qq_35398413/article/details/82821165