整体架构
- 读取数据
- 训练模型
分部细节
生成训练数据
从hadoop生成两个子文件
(1)docid,pv,cl
(2)mid,cl_docid_duration_dict, pv_docid_set
生成训练数据:
get_docid_doctype_videotime_by_kv.py
gen_mid_docid.py
gen_train_data.py
生成的结果是 mid,docid,score;其中 score 的计算是重点,如下:
def evaluate_score(docid_duration, docid_doctype_videotime_dict):
    """Compute an implicit-feedback score in [0, 1] for one (docid, duration) pair.

    Args:
        docid_duration: string of the form "docid:duration_seconds".
        docid_doctype_videotime_dict: maps docid -> (doctype, videotime), where
            doctype 0 = article, 1 = long video, 2 = short video
            (presumably -- doctype semantics inferred from the clamping constants;
            confirm against the upstream data generators).

    Returns:
        float score in (0, 1], or 0 for any malformed/unknown input.
    """
    items0 = docid_duration.split(':')
    if len(items0) != 2:
        return 0
    docid = items0[0]
    try:
        # A recorded duration of 0 is treated as a minimal 5-second view.
        duration = int(items0[1]) if items0[1] != '0' else 5
    except ValueError:
        # Non-numeric duration field: treat as malformed input.
        return 0
    # Unknown docids (not present in the metadata dict) contribute nothing;
    # the original raised KeyError here.
    if docid not in docid_doctype_videotime_dict:
        return 0
    doctype, videotime = docid_doctype_videotime_dict[docid]
    score = 0  # unknown doctype falls through to 0 (original raised UnboundLocalError)
    if doctype == 0:
        # Articles: cap reading time at 180s, map linearly into [0.5, 1.0].
        duration = 180 if duration > 180 else duration
        score = 0.5 + 0.5 * (duration / 180.0)
    if doctype == 1:
        # Long videos: clamp video length to [60, 360]s, cap watch time at the
        # video length, then weight completion ratio by a length bonus.
        videotime = 60 if videotime < 60 else (360 if videotime > 360 else videotime)
        duration = videotime if duration > videotime else duration
        score = 0.5 + 0.5 * (duration * 1.0 / videotime) * (0.5 + 0.1 * (videotime - 60) / 60.0)
    if doctype == 2:
        # Short videos: clamp video length to [10, 360]s; score is completion ratio.
        videotime = 10 if videotime < 10 else (360 if videotime > 360 else videotime)
        duration = videotime if duration > videotime else duration
        score = 0.5 + 0.5 * (duration * 1.0 / videotime)
    return score
bm25矩阵分解
读取第一步得到的mid,docid,score,生成三个列表userid_list, itemid_list, score_list
训练模型,得到item和user的embedding
from scipy import sparse
import implicit
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender, TFIDFRecommender, bm25_weight)
def train_model():
    """Fit an implicit-feedback ALS model on the BM25-weighted item-user matrix.

    Reads the global (userid_list, itemid_list, score_list) triples, builds a
    sparse user-item matrix, transposes it to item-user orientation, applies
    BM25 weighting, and stores the CSR matrix and fitted model in globals.
    """
    global userid_list, itemid_list, score_list, max_userid, max_itemid, csr_item_user_mat, model
    shape = (max_userid + 1, max_itemid + 1)
    user_item = sparse.coo_matrix((score_list, (userid_list, itemid_list)), shape=shape)
    # implicit expects items on the rows; weight before factorization.
    weighted = bm25_weight(user_item.T, K1=100, B=0.8)
    csr_item_user_mat = weighted.tocsr()
    model = implicit.als.AlternatingLeastSquares(
        factors=100, iterations=10, calculate_training_loss=True)
    model.fit(csr_item_user_mat)
def _write_embeddings(filepath, factors):
    """Write one 'index<TAB>comma-joined-vector' line per embedding row."""
    with open(filepath, 'w') as fw:
        for i, emb in enumerate(factors):
            fw.write(str(i) + '\t' + ','.join(map(str, emb)) + '\n')

def output_embedding(user_embedding_filepath, item_embedding_filepath, trained_model=None):
    """Dump user and item embeddings to two TSV files.

    Args:
        user_embedding_filepath: output path for user embeddings.
        item_embedding_filepath: output path for item embeddings.
        trained_model: optional fitted model with .user_factors / .item_factors;
            defaults to the module-level `model` global (original behavior).
    """
    global model
    m = model if trained_model is None else trained_model
    # `with` (in the helper) guarantees the handles are closed even on error;
    # the original leaked both files if a write raised.
    _write_embeddings(user_embedding_filepath, m.user_factors)
    _write_embeddings(item_embedding_filepath, m.item_factors)
这里介绍几个知识点:
- sparse.coo_matrix():构造坐标(COO)格式的稀疏矩阵;随后调用 tocsr() 将其转换为压缩稀疏行(CSR)格式时,坐标重复的条目会自动汇总(求和)在一起
参考 - bm25_weight
其实我们可以用numpy或者是自己写公式完成矩阵分解的过程,但是会比较慢,这里提供一个比较快的方法就是利用implicit库中的bm25算法
参考
lightfm
from lightfm import LightFM
from lightfm.data import Dataset
def train_model():
global mid_list, docid_list, score_list, dataset, _id_user_mapping, _id_item_mapping, model
dataset = Dataset()
dataset.fit(users=np.unique(mid_list), items=np.unique(docid_list))
print 'len(dataset._user_id_mapping):', len(dataset._user_id_mapping)
print 'len(dataset._item_id_mapping):', len(dataset._item_id_mapping)
_id_user_mapping = {v: k for k, v in dataset._user_id_mapping.items()}
_id_item_mapping = {v: k for k, v in dataset._item_id_mapping.items()}
train_interactions, train_weights = dataset.build_interactions((mid_list[i], docid_list[i], score_list[i]) for i in range(len(score_list)))
model = LightFM(no_components=100, loss='warp')
print 'model.fit start ...'
model.fit(interactions=train_interactions, sample_weight=train_weights, epochs=my_epochs, num_threads=20, verbose=True)
print 'model.fit end ...'
item_biased, item_representations = model.get_item_representations()
print item_biased.shape
#print item_biased
print item_representations.shape
#print item_representations
user_biases, user_embeddings = model.get_user_representations()
print user_biases.shape
#print user_biases
print user_embeddings.shape