python 推荐算法实例,电影推荐,挖掘隐含特征推荐,LFM推荐算法

推荐算法实例代码:

1.数据处理过程,主要涉及数据的读取,文件data_process.py

import pandas as pd
import os
import csv

def get_item_info(input_file):
    """
    Load item metadata from a CSV file.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, the first column is used as the item ID and the rest of
    the columns are stored as that item's info.

    input_file: path to the item CSV file
    return:
        dict:  {itemID: [item_info]}; an empty dict if the file does not exist
    """
    if not os.path.exists(input_file):
        return {}

    item_info = {}
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks are parsed correctly.
    with open(input_file, "r", encoding='utf-8', newline='') as file:
        rows = csv.reader(file)
        next(rows, None)              # skip the header row (no-op on an empty file)
        for row in rows:
            if not row:               # csv.reader yields [] for blank lines
                continue
            item_info[row[0]] = row[1:]
    return item_info


def get_average_score(input_file):
    """
    Compute the mean rating of every item.

    input_file: path to the ratings file (ratings.csv); it is read with
                pandas and its "movieId" and "rating" columns are used
    return:
        dict  {itemID: average_score}, with item IDs converted to strings
        and averages rounded to three decimals; an empty dict if the file
        does not exist
    """
    if not os.path.exists(input_file):
        return {}

    ratings = pd.read_csv(input_file)
    # One row per movieId, holding the mean of its ratings.
    mean_by_item = ratings[["movieId", "rating"]].groupby("movieId").agg("mean")

    item_ids = mean_by_item.index.values.astype("str")      # IDs as strings
    averages = mean_by_item["rating"].values.round(3)       # three decimals
    return dict(zip(item_ids, averages))


def get_train_data(input_file):
    """
    Build the LFM training samples from the ratings file.

    Ratings strictly above the threshold (4.0) become positive samples. All
    other ratings are negative candidates, ranked by the item's average
    score (highest first). For each user the same number of positives and
    negatives is kept; a user lacking either kind contributes nothing.

    input_file: path to the user/item rating file
    return:
        list  [(userID, itemID, label), ...] with label 1 for positive
        samples and 0 for negative ones; an empty list if the file does
        not exist
    """
    if not os.path.exists(input_file):
        return []

    threshold = 4.0
    average_scores = get_average_score(input_file)
    positives, negatives = {}, {}

    with open(input_file, "r", encoding='utf-8') as file:
        rows = csv.reader(file)
        next(rows, None)                       # skip the header row
        for row in rows:
            userID, itemID, rating = row[0], row[1], float(row[2])
            # Every user seen gets an entry in both dicts, even if empty.
            user_pos = positives.setdefault(userID, [])
            user_neg = negatives.setdefault(userID, [])
            if rating > threshold:
                user_pos.append((itemID, 1))
            else:
                # Negative candidates carry the item's average score for ranking.
                user_neg.append((itemID, average_scores.get(itemID, 0)))

    # Balance positives and negatives per user.
    train_data = []
    for userID, user_pos in positives.items():
        keep = min(len(user_pos), len(negatives[userID]))
        if keep <= 0:
            continue
        train_data.extend((userID, itemID, label) for itemID, label in user_pos[:keep])
        ranked_neg = sorted(negatives[userID], key=lambda pair: pair[1], reverse=True)
        train_data.extend((userID, itemID, 0) for itemID, _ in ranked_neg[:keep])
    return train_data


if __name__ == '__main__':
    # Smoke test: build the training set from the ratings table and show
    # the first ten samples.
    ratings_path = "./data/ratings.csv"
    samples = get_train_data(ratings_path)
    print(samples[:10])

2.LFM的训练与预测,需要导入上面的文件data_process.py,处理流程大致如下:

import numpy as np
# import sys
# sys.path.append()

from data_process import *         # 导入数据处理脚本,注意位置
import operator
from tqdm import *


def init_vector(train_data, vector_len):
    """
    Randomly initialise one latent vector per user and per item.

    train_data: training samples [(userID, itemID, label), ...]
    vector_len: length of every vector
    return:
        (user_vectors, item_vectors): two dicts mapping each ID that occurs
        in train_data to a vector of vector_len standard-normal values
    """
    init_user_vec = {}
    init_item_vec = {}
    for userID, itemID, _ in train_data:
        # Draw a vector only the first time an ID is seen; redrawing on every
        # training row would just overwrite it with another random vector.
        if userID not in init_user_vec:
            init_user_vec[userID] = np.random.randn(vector_len)
        if itemID not in init_item_vec:
            init_item_vec[itemID] = np.random.randn(vector_len)
    return init_user_vec, init_item_vec


def lfm_train(train_data, F, alpha, learning_rate, step):
    """
    Train the latent factor model by iterating over the samples and
    updating the user and item vectors after each one.

    For every sample the predicted preference is the cosine similarity of
    the user vector and the item vector; the difference between the label
    and that prediction drives the update of both vectors. The learning
    rate is multiplied by 0.95 after every pass over the training data.

    train_data: training samples [(userID, itemID, label), ...]
    F:     number of latent features (vector length)
    alpha:  regularisation coefficient
    learning_rate: initial learning rate
    step: number of passes over the training data
    return:
        dict  {userID: user_vector}
        dict  {itemID: item_vector}
    """
    user_vec, item_vec = init_vector(train_data, F)            # random initial vectors
    for _ in tqdm(range(step), desc="训练进度: "):
        for userID, itemID, label in train_data:
            user_vector, item_vector = user_vec[userID], item_vec[itemID]
            # Predicted preference: cosine similarity of the two vectors.
            vector_dot = np.dot(user_vector, item_vector) / (np.linalg.norm(user_vector) * np.linalg.norm(item_vector))
            error = label - vector_dot
            # Compute both steps from the pre-update vectors. Updating
            # user_vector in place first would feed the already-modified
            # user vector into the item update.
            user_delta = learning_rate * (error * item_vector - alpha * user_vector)
            item_delta = learning_rate * (error * user_vector - alpha * item_vector)
            # In-place addition updates the arrays stored in the dicts.
            user_vector += user_delta
            item_vector += item_delta
        learning_rate = learning_rate * 0.95                  # learning-rate decay
    return user_vec, item_vec

def top_n_item(user_vec, item_vec, userid, top_n=10):
    """
    Return the items a user is predicted to like most.

    The score of an item is the cosine similarity between the user's vector
    and the item's vector.

    user_vec:   dict {userID: user_vector}
    item_vec:   dict {itemID: item_vector}
    userid:     ID of the user to recommend for
    top_n:      maximum number of items to return (default 10)
    return:
        a list: [(item, score), (item1, score1), ...] sorted by score in
        descending order, scores rounded to three decimals; an empty list
        if userid is not in user_vec
    """
    if userid not in user_vec:
        return []

    user_vector = user_vec[userid]
    user_norm = np.linalg.norm(user_vector)     # loop-invariant, computed once

    item_score = {}                             # itemID -> preference score
    for itemID, item_vector in item_vec.items():
        item_score[itemID] = np.dot(user_vector, item_vector) / (user_norm * np.linalg.norm(item_vector))

    # Rank by score, highest first, and keep the first top_n entries.
    ranked = sorted(item_score.items(), key=operator.itemgetter(1), reverse=True)[:top_n]
    return [(itemID, round(score, 3)) for itemID, score in ranked]

def print_top_result(train_data, userid, top_n):
    """
    Print the items a user liked in the training data, followed by the
    items recommended for that user.

    Item details are looked up in "./data/movies.csv". An item with no
    entry there (or a missing file) is printed with an empty info list
    instead of raising KeyError.

    :param train_data: training samples [(userID, itemID, label), ...]
    :param userid:     ID of the user to report on
    :param top_n:      recommended items as [(itemID, score), ...]
    :return: None
    """
    item_info = get_item_info("./data/movies.csv")     # item metadata
    print("----------------------------User clicked item----------------------------")
    for userID_tem, itemID, label in train_data:
        if userID_tem == userid and label == 1:       # this user's positive samples only
            print(f"[User clicked items] itemID : {itemID} ; item_info : {item_info.get(itemID, [])} ")

    print("----------------------------Recommendation result of LFM algorithm----------------------------")

    for item_i in top_n:
        print(f"[Recommended items] itemID : {item_i[0]} ; item_info : {item_info.get(item_i[0], [])} ; 喜爱程度: {item_i[1]}")

if __name__ == '__main__':
    # Demo: train on the ratings table, then print the liked items and the
    # recommendations for user '10'.
    train_data = get_train_data("./data/ratings.csv")
    user_vec, item_vec = lfm_train(train_data, F=32, alpha=0.01, learning_rate=0.1, step=2)
    userID = "10"
    print_top_result(train_data, userID, top_n_item(user_vec, item_vec, userID))

数据如下:

https://download.csdn.net/download/L_goodboy/86511455 

网盘链接:

链接:https://pan.baidu.com/s/1kNXonLMxYaql6Hy15ktc7Q 
提取码:tn2n

猜你喜欢

转载自blog.csdn.net/L_goodboy/article/details/126745569