#! /usr/bin/env python
# -*- coding:utf-8 -*-
import sys
import os
from multiprocessing import Process, Lock, Queue, Manager
from multiprocessing.managers import BaseManager
import argparse
import math
import numpy as np
# Command-line interface. Each row is (flag, value type, default, help text);
# the rows are registered in order, so the generated --help output keeps the
# same option ordering.
parser = argparse.ArgumentParser(description='filter_word_pair')
for _flag, _kind, _fallback, _hint in (
    ('--output', str, "out.syn.news", 'output'),
    ('--threads', int, 30, 'thread num'),
    ('--glsa_dict', str, "glsa.txt", 'glsa dict'),
    ('--lsi_dict', str, "lsi.txt", 'lsi dict'),
    ('--word2vec_dict', str, "word2vec.bin", 'word2vec dict'),
):
    parser.add_argument(_flag, type=_kind, default=_fallback, help=_hint)
# Parsed once at import time; the rest of the script reads options from `args`.
args = parser.parse_args()
def cosin(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric sequences.

    Returns 0 when the sequences differ in length or when either one has a
    Euclidean norm below 1e-7 (treated as a zero vector).
    """
    if len(vec1) != len(vec2):
        return 0

    def _length(vec):
        # Euclidean norm, accumulated left to right.
        squares = 0.0
        for component in vec:
            squares += component * component
        return math.sqrt(squares)

    len_a = _length(vec1)
    len_b = _length(vec2)
    if len_a < 0.0000001 or len_b < 0.0000001:
        return 0

    dot = 0.0
    for left, right in zip(vec1, vec2):
        dot += left * right
    return dot * 1.0 / (len_a * len_b)
def weighted_cosin(vec1, vec2, sigma):
    """Return the cosine similarity of two vectors with per-dimension weights.

    Every component i is scaled by ``sigma[i]`` before the cosine is taken,
    i.e. the result is ``cos(vec1 * sigma, vec2 * sigma)``.

    Returns 0 when the vectors differ in length, when ``sigma`` has fewer
    entries than the vectors (previously an IndexError), or when either
    weighted norm is below 1e-7. Extra trailing entries in ``sigma`` are
    ignored.
    """
    if len(vec1) != len(vec2) or len(sigma) < len(vec1):
        return 0

    norm1 = 0.0
    norm2 = 0.0
    dot = 0.0
    # One pass over the three sequences; the multiplication order matches the
    # original expressions so the floating-point results are unchanged.
    for a, b, s in zip(vec1, vec2, sigma):
        norm1 += a * a * s * s
        norm2 += b * b * s * s
        dot += a * b * s * s

    norm1 = math.sqrt(norm1)
    norm2 = math.sqrt(norm2)
    if norm1 < 0.0000001 or norm2 < 0.0000001:
        return 0
    return dot * 1.0 / (norm1 * norm2)
class LSIDict(object):
    """In-memory table of word vectors with per-dimension weights.

    Attributes:
        data: dict mapping a word to its list of float components.
        sigma: list of float weights, one per dimension (presumably the
            singular values of the decomposition -- confirm with the file
            producer).
        dim: the integer read from the first line of the file; it is stored
            but not otherwise used by this class.
    """

    def __init__(self):
        self.data = {}
        self.sigma = []
        self.dim = 0

    def load_lsi_from_binfile(self, word_vec_file):
        """Replace the current contents with the vectors in *word_vec_file*.

        Expected layout, as read here:
            line 1: an integer (stored in ``dim``)
            line 2: whitespace-separated sigma weights
            rest:   ``word<TAB>v1 v2 ...`` -- lines that do not contain
                    exactly one tab are skipped

        Raises ValueError if the first line or any numeric field cannot be
        parsed.
        """
        # Despite the method name the content is delimited text. Text mode
        # keeps the str-based splitting below working on Python 3, where a
        # binary handle would yield bytes.
        with open(word_vec_file, "r") as f:
            self.data = {}
            self.dim = int(f.readline())
            # split() with no argument tolerates trailing or repeated
            # separators, which split("\t") / split(" ") turned into
            # unparsable fields.
            self.sigma = [float(value) for value in f.readline().split()]
            # Stream the file instead of materialising it with readlines().
            for line in f:
                tlist = line.split("\t")
                if len(tlist) != 2:
                    continue
                word = tlist[0]
                self.data[word] = [float(value) for value in tlist[1].split()]

    def calc_sim(self, term1, term2):
        """Return the sigma-weighted cosine similarity of two words.

        Identical terms score 1 without a lookup; a term missing from the
        table scores 0.
        """
        if term1 == term2:
            return 1
        if term1 not in self.data or term2 not in self.data:
            return 0
        return weighted_cosin(self.data[term1], self.data[term2], self.sigma)

    def test_suite(self):
        """Print the similarity of three fixed word pairs as a smoke check."""
        # The single-argument print(...) form produces the same line as the
        # old two-item print statement on both Python 2 and Python 3.
        print("%s %s" % ("北京:上海", self.calc_sim("北京", "上海")))
        print("%s %s" % ("北大:清华", self.calc_sim("北大", "清华")))
        print("%s %s" % ("北大:aa", self.calc_sim("北大", "aa")))
class Word2vecDict(object):
    """In-memory table of word vectors read from a word2vec-style binary file.

    Attributes:
        data: dict mapping a word to a 1-D float32 numpy array.
    """

    def __init__(self):
        self.data = {}

    def load_wordvecs_from_binfile(self, word_vec_file):
        """Add every vector stored in *word_vec_file* to ``data``.

        Layout, as read here: a text header line ``vocab_size layer1_size``,
        then for each word the word's bytes terminated by a single space,
        followed by ``layer1_size`` raw float32 values. Newline bytes in
        front of a word are ignored.

        Raises ValueError if the file ends in the middle of a word or of a
        vector. The original loop never terminated in the first case.
        """
        with open(word_vec_file, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = [int(field) for field in header.split()]
            binary_len = np.dtype('float32').itemsize * layer1_size
            remaining = vocab_size
            while remaining > 0:
                remaining -= 1
                chars = []
                while True:
                    ch = f.read(1)
                    if not ch:
                        # read() returns an empty string only at end of file.
                        raise ValueError(
                            "unexpected end of file while reading a word "
                            "from %s" % word_vec_file)
                    if ch == b' ':
                        break
                    if ch != b'\n':
                        chars.append(ch)
                word = b''.join(chars)
                if not isinstance(word, str):
                    # Python 3 only: keys must be str to match the lookup
                    # terms. Assumes a UTF-8 vocabulary -- TODO confirm.
                    word = word.decode('utf-8')
                raw = f.read(binary_len)
                if len(raw) != binary_len:
                    raise ValueError(
                        "truncated vector for word %r in %s"
                        % (word, word_vec_file))
                # frombuffer replaces the deprecated binary mode of
                # np.fromstring; copy() keeps the array writable and
                # independent of the read buffer, as before.
                self.data[word] = np.frombuffer(raw, dtype='float32').copy()

    def calc_sim(self, term1, term2):
        """Return the cosine similarity of two words.

        Identical terms score 1 without a lookup; a term missing from the
        table scores 0.
        """
        if term1 == term2:
            return 1
        if term1 not in self.data or term2 not in self.data:
            return 0
        return cosin(self.data[term1], self.data[term2])
# Read the candidate word pairs. A usable row has exactly five tab-separated
# fields; only the first two (the two terms) are kept, and rows whose two
# terms are identical are dropped.
glist = []
count = 0
# `with` closes the input file; the bare open() in the for-statement leaked
# the handle.
with open("./out.syn.top5") as pair_file:
    for line in pair_file:
        line = line.strip()
        if not line:
            continue
        tlist = line.split("\t")
        if len(tlist) != 5:
            continue
        terma = tlist[0]
        termb = tlist[1]
        if terma == termb:
            continue
        count += 1
        if count % 10000 == 0:
            # Progress goes to stderr so stdout stays clean.
            sys.stderr.write("handled %d lines\n" % (count))
        glist.append((terma, termb))
# Load the three similarity sources named on the command line. Both LSIDict
# tables print their built-in smoke check right after loading.
sys.stderr.write("start load Word2vecDict" + "\n")
word2vec_dict = Word2vecDict()
word2vec_dict.load_wordvecs_from_binfile(args.word2vec_dict)

glsa_dict = LSIDict()
print("start load GLSADict")
glsa_dict.load_lsi_from_binfile(args.glsa_dict)
glsa_dict.test_suite()
print("end load GLSADict")

lsi_dict = LSIDict()
print("start load LSIDict")
lsi_dict.load_lsi_from_binfile(args.lsi_dict)
lsi_dict.test_suite()
print("end load LSIDict")
# Lock held by a worker while it puts a result row on the output queue.
lock = Lock()


def calc_sim(tasks, begin, end, out):
    """Score the word pairs ``tasks[begin:end]`` and put the rows on *out*.

    For each pair one tuple ``(term_a, term_b, glsa, lsi, word2vec)`` is put
    on the queue, where the three scores come from the module-level
    ``glsa_dict``, ``lsi_dict`` and ``word2vec_dict`` tables.
    """
    for position in range(begin, end):
        pair = tasks[position]
        first, second = pair[0], pair[1]
        glsa_score = glsa_dict.calc_sim(first, second)
        lsi_score = lsi_dict.calc_sim(first, second)
        w2v_score = word2vec_dict.calc_sim(first, second)
        row = (first, second, glsa_score, lsi_score, w2v_score)
        with lock:
            out.put(row)
# Output columns actually written below: w1, w2, glsa, lsi, word2vec.
# (The earlier note also listed prob and ratio, which are not emitted.)
# The cosine similarities are computed with multiple processes.
# Clamp to one worker so --threads 0 (or a negative value) cannot cause a
# division by zero or an empty run.
Kthread = max(1, int(args.threads))
load = len(glist)
# Floor division keeps the slice bounds integral on Python 3 as well.
quota = load // Kthread
remain = load - quota * Kthread
threads = []
manager = Manager()
out = manager.Queue()
for i in range(Kthread):
    begin = i * quota
    end = begin + quota
    if i == Kthread - 1:
        # The last worker also takes the pairs left over by the division.
        end += remain
    sys.stderr.write("load=%d thread %d begin=%d end=%d\n" % (load, i, begin, end))
    th = Process(target=calc_sim, args=(glist, begin, end, out))
    th.daemon = True
    th.start()
    threads.append(th)
for th in threads:
    th.join()
# All workers have exited, so the queue holds the complete result set.
# `with` makes sure the output file is flushed and closed.
with open(args.output, "w") as fout:
    while not out.empty():
        info = out.get()
        fout.write("%s\t%s\t%f\t%f\t%f\n" % (info[0], info[1], info[2], info[3], info[4]))
# -*- coding:utf-8 -*-
import sys
import os
from multiprocessing import Process, Lock, Queue, Manager
from multiprocessing.managers import BaseManager
import argparse
import math
import numpy as np
# Command-line interface. Each row is (flag, value type, default, help text);
# the rows are registered in order, so the generated --help output keeps the
# same option ordering.
parser = argparse.ArgumentParser(description='filter_word_pair')
for _flag, _kind, _fallback, _hint in (
    ('--output', str, "out.syn.news", 'output'),
    ('--threads', int, 30, 'thread num'),
    ('--glsa_dict', str, "glsa.txt", 'glsa dict'),
    ('--lsi_dict', str, "lsi.txt", 'lsi dict'),
    ('--word2vec_dict', str, "word2vec.bin", 'word2vec dict'),
):
    parser.add_argument(_flag, type=_kind, default=_fallback, help=_hint)
# Parsed once at import time; the rest of the script reads options from `args`.
args = parser.parse_args()
def cosin(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric sequences.

    Returns 0 when the sequences differ in length or when either one has a
    Euclidean norm below 1e-7 (treated as a zero vector).
    """
    if len(vec1) != len(vec2):
        return 0

    def _length(vec):
        # Euclidean norm, accumulated left to right.
        squares = 0.0
        for component in vec:
            squares += component * component
        return math.sqrt(squares)

    len_a = _length(vec1)
    len_b = _length(vec2)
    if len_a < 0.0000001 or len_b < 0.0000001:
        return 0

    dot = 0.0
    for left, right in zip(vec1, vec2):
        dot += left * right
    return dot * 1.0 / (len_a * len_b)
def weighted_cosin(vec1, vec2, sigma):
    """Return the cosine similarity of two vectors with per-dimension weights.

    Every component i is scaled by ``sigma[i]`` before the cosine is taken,
    i.e. the result is ``cos(vec1 * sigma, vec2 * sigma)``.

    Returns 0 when the vectors differ in length, when ``sigma`` has fewer
    entries than the vectors (previously an IndexError), or when either
    weighted norm is below 1e-7. Extra trailing entries in ``sigma`` are
    ignored.
    """
    if len(vec1) != len(vec2) or len(sigma) < len(vec1):
        return 0

    norm1 = 0.0
    norm2 = 0.0
    dot = 0.0
    # One pass over the three sequences; the multiplication order matches the
    # original expressions so the floating-point results are unchanged.
    for a, b, s in zip(vec1, vec2, sigma):
        norm1 += a * a * s * s
        norm2 += b * b * s * s
        dot += a * b * s * s

    norm1 = math.sqrt(norm1)
    norm2 = math.sqrt(norm2)
    if norm1 < 0.0000001 or norm2 < 0.0000001:
        return 0
    return dot * 1.0 / (norm1 * norm2)
class LSIDict(object):
    """In-memory table of word vectors with per-dimension weights.

    Attributes:
        data: dict mapping a word to its list of float components.
        sigma: list of float weights, one per dimension (presumably the
            singular values of the decomposition -- confirm with the file
            producer).
        dim: the integer read from the first line of the file; it is stored
            but not otherwise used by this class.
    """

    def __init__(self):
        self.data = {}
        self.sigma = []
        self.dim = 0

    def load_lsi_from_binfile(self, word_vec_file):
        """Replace the current contents with the vectors in *word_vec_file*.

        Expected layout, as read here:
            line 1: an integer (stored in ``dim``)
            line 2: whitespace-separated sigma weights
            rest:   ``word<TAB>v1 v2 ...`` -- lines that do not contain
                    exactly one tab are skipped

        Raises ValueError if the first line or any numeric field cannot be
        parsed.
        """
        # Despite the method name the content is delimited text. Text mode
        # keeps the str-based splitting below working on Python 3, where a
        # binary handle would yield bytes.
        with open(word_vec_file, "r") as f:
            self.data = {}
            self.dim = int(f.readline())
            # split() with no argument tolerates trailing or repeated
            # separators, which split("\t") / split(" ") turned into
            # unparsable fields.
            self.sigma = [float(value) for value in f.readline().split()]
            # Stream the file instead of materialising it with readlines().
            for line in f:
                tlist = line.split("\t")
                if len(tlist) != 2:
                    continue
                word = tlist[0]
                self.data[word] = [float(value) for value in tlist[1].split()]

    def calc_sim(self, term1, term2):
        """Return the sigma-weighted cosine similarity of two words.

        Identical terms score 1 without a lookup; a term missing from the
        table scores 0.
        """
        if term1 == term2:
            return 1
        if term1 not in self.data or term2 not in self.data:
            return 0
        return weighted_cosin(self.data[term1], self.data[term2], self.sigma)

    def test_suite(self):
        """Print the similarity of three fixed word pairs as a smoke check."""
        # The single-argument print(...) form produces the same line as the
        # old two-item print statement on both Python 2 and Python 3.
        print("%s %s" % ("北京:上海", self.calc_sim("北京", "上海")))
        print("%s %s" % ("北大:清华", self.calc_sim("北大", "清华")))
        print("%s %s" % ("北大:aa", self.calc_sim("北大", "aa")))
class Word2vecDict(object):
    """In-memory table of word vectors read from a word2vec-style binary file.

    Attributes:
        data: dict mapping a word to a 1-D float32 numpy array.
    """

    def __init__(self):
        self.data = {}

    def load_wordvecs_from_binfile(self, word_vec_file):
        """Add every vector stored in *word_vec_file* to ``data``.

        Layout, as read here: a text header line ``vocab_size layer1_size``,
        then for each word the word's bytes terminated by a single space,
        followed by ``layer1_size`` raw float32 values. Newline bytes in
        front of a word are ignored.

        Raises ValueError if the file ends in the middle of a word or of a
        vector. The original loop never terminated in the first case.
        """
        with open(word_vec_file, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = [int(field) for field in header.split()]
            binary_len = np.dtype('float32').itemsize * layer1_size
            remaining = vocab_size
            while remaining > 0:
                remaining -= 1
                chars = []
                while True:
                    ch = f.read(1)
                    if not ch:
                        # read() returns an empty string only at end of file.
                        raise ValueError(
                            "unexpected end of file while reading a word "
                            "from %s" % word_vec_file)
                    if ch == b' ':
                        break
                    if ch != b'\n':
                        chars.append(ch)
                word = b''.join(chars)
                if not isinstance(word, str):
                    # Python 3 only: keys must be str to match the lookup
                    # terms. Assumes a UTF-8 vocabulary -- TODO confirm.
                    word = word.decode('utf-8')
                raw = f.read(binary_len)
                if len(raw) != binary_len:
                    raise ValueError(
                        "truncated vector for word %r in %s"
                        % (word, word_vec_file))
                # frombuffer replaces the deprecated binary mode of
                # np.fromstring; copy() keeps the array writable and
                # independent of the read buffer, as before.
                self.data[word] = np.frombuffer(raw, dtype='float32').copy()

    def calc_sim(self, term1, term2):
        """Return the cosine similarity of two words.

        Identical terms score 1 without a lookup; a term missing from the
        table scores 0.
        """
        if term1 == term2:
            return 1
        if term1 not in self.data or term2 not in self.data:
            return 0
        return cosin(self.data[term1], self.data[term2])
# Read the candidate word pairs. A usable row has exactly five tab-separated
# fields; only the first two (the two terms) are kept, and rows whose two
# terms are identical are dropped.
glist = []
count = 0
# `with` closes the input file; the bare open() in the for-statement leaked
# the handle.
with open("./out.syn.top5") as pair_file:
    for line in pair_file:
        line = line.strip()
        if not line:
            continue
        tlist = line.split("\t")
        if len(tlist) != 5:
            continue
        terma = tlist[0]
        termb = tlist[1]
        if terma == termb:
            continue
        count += 1
        if count % 10000 == 0:
            # Progress goes to stderr so stdout stays clean.
            sys.stderr.write("handled %d lines\n" % (count))
        glist.append((terma, termb))
# Load the three similarity sources named on the command line. Both LSIDict
# tables print their built-in smoke check right after loading.
sys.stderr.write("start load Word2vecDict" + "\n")
word2vec_dict = Word2vecDict()
word2vec_dict.load_wordvecs_from_binfile(args.word2vec_dict)

glsa_dict = LSIDict()
print("start load GLSADict")
glsa_dict.load_lsi_from_binfile(args.glsa_dict)
glsa_dict.test_suite()
print("end load GLSADict")

lsi_dict = LSIDict()
print("start load LSIDict")
lsi_dict.load_lsi_from_binfile(args.lsi_dict)
lsi_dict.test_suite()
print("end load LSIDict")
# Lock held by a worker while it puts a result row on the output queue.
lock = Lock()


def calc_sim(tasks, begin, end, out):
    """Score the word pairs ``tasks[begin:end]`` and put the rows on *out*.

    For each pair one tuple ``(term_a, term_b, glsa, lsi, word2vec)`` is put
    on the queue, where the three scores come from the module-level
    ``glsa_dict``, ``lsi_dict`` and ``word2vec_dict`` tables.
    """
    for position in range(begin, end):
        pair = tasks[position]
        first, second = pair[0], pair[1]
        glsa_score = glsa_dict.calc_sim(first, second)
        lsi_score = lsi_dict.calc_sim(first, second)
        w2v_score = word2vec_dict.calc_sim(first, second)
        row = (first, second, glsa_score, lsi_score, w2v_score)
        with lock:
            out.put(row)
# Output columns actually written below: w1, w2, glsa, lsi, word2vec.
# (The earlier note also listed prob and ratio, which are not emitted.)
# The cosine similarities are computed with multiple processes.
# Clamp to one worker so --threads 0 (or a negative value) cannot cause a
# division by zero or an empty run.
Kthread = max(1, int(args.threads))
load = len(glist)
# Floor division keeps the slice bounds integral on Python 3 as well.
quota = load // Kthread
remain = load - quota * Kthread
threads = []
manager = Manager()
out = manager.Queue()
for i in range(Kthread):
    begin = i * quota
    end = begin + quota
    if i == Kthread - 1:
        # The last worker also takes the pairs left over by the division.
        end += remain
    sys.stderr.write("load=%d thread %d begin=%d end=%d\n" % (load, i, begin, end))
    th = Process(target=calc_sim, args=(glist, begin, end, out))
    th.daemon = True
    th.start()
    threads.append(th)
for th in threads:
    th.join()
# All workers have exited, so the queue holds the complete result set.
# `with` makes sure the output file is flushed and closed.
with open(args.output, "w") as fout:
    while not out.empty():
        info = out.get()
        fout.write("%s\t%s\t%f\t%f\t%f\n" % (info[0], info[1], info[2], info[3], info[4]))