基于简单的协同过滤和网状推荐系统的实现（Python 和 MovieLens 数据集）

　　本文的理论基础是上篇我看的《个性化推荐系统的研究进展》（周涛等，2009），这是一篇比较老的综述了，所以里面的方法也比较老：），但是用来练手不错，最起码实现起来没那么头疼，主要是普通电脑也相对能算得动 xD。原理在我的另一个思维导图里已经有了，在此不再赘述，直接上代码（本代码力求先实现，所以还没有优化，目前采用串行计算，有时间的话会再优化）：

　　0，提示：虽然没有用并行算法，但是可以考虑用jit加速。而且很重要的一点是，关联矩阵非常稀疏，要考虑度为零，相似性为零等情况，不然老会得不到结果。

本实例中用的是6040x3952(用户，电影)的数据包。

　　1，用户与产品的关联矩阵、度、评分矩阵的写入：

def GetData(path="rating.txt", n_users=6040, n_movies=3952):
    """Load a MovieLens-style ratings file into dense user x movie matrices.

    Each non-empty line must look like ``user::movie::rating[::...]`` with
    1-based integer user and movie IDs; any fields after the third are
    ignored.

    Args:
        path: Ratings file to read. Defaults to ``"rating.txt"``.
        n_users: Number of rows (users) in the returned matrices.
        n_movies: Number of columns (movies) in the returned matrices.

    Returns:
        A tuple ``(user_vec, user_rating, degree_user, degree_mov)``:

        * ``user_vec`` -- ``(n_users, n_movies)`` array, 1 where the user
          rated the movie strictly above 3, else 0.
        * ``user_rating`` -- ``(n_users, n_movies)`` array holding the raw
          rating, 0 where no (non-zero) rating exists.
        * ``degree_user`` -- ``(1, n_users)`` array, number of rating rows
          above 3 per user.
        * ``degree_mov`` -- ``(1, n_movies)`` array, number of rating rows
          above 3 per movie.

    Raises:
        OSError: If ``path`` cannot be opened.
        ValueError: If a line has non-integer fields.
        IndexError: If a line has fewer than three fields or an ID exceeds
            the requested matrix size.
    """
    # Parse by hand: recent NumPy releases reject multi-character
    # delimiters in ``np.loadtxt`` and no longer provide ``np.str``/``np.int``.
    rows = []
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            fields = line.split("::")
            rows.append((int(fields[0]), int(fields[1]), int(fields[2])))

    # reshape keeps the column slicing below valid for an empty file.
    data = np.array(rows, dtype=np.int64).reshape(-1, 3)
    users = data[:, 0] - 1    # IDs in the file are 1-based
    movies = data[:, 1] - 1
    scores = data[:, 2]

    user_vec = np.zeros((n_users, n_movies))
    user_rating = np.zeros((n_users, n_movies))

    # A rating above 3 counts as a "like" and forms an edge of the
    # user-movie bipartite graph.
    liked = scores > 3
    user_vec[users[liked], movies[liked]] = 1

    # Degrees are counted per rating row, one increment per liked row.
    degree_user = np.bincount(users[liked], minlength=n_users)
    degree_mov = np.bincount(movies[liked], minlength=n_movies)
    degree_user = degree_user.astype(float).reshape(1, -1)
    degree_mov = degree_mov.astype(float).reshape(1, -1)

    # Only non-zero ratings are written to the rating matrix.
    rated = scores != 0
    user_rating[users[rated], movies[rated]] = scores[rated]

    return user_vec, user_rating, degree_user, degree_mov

　　2，相似度计算：

def SimCom(user_vec):
    """Compute the user-user cosine similarity matrix.

    Args:
        user_vec: ``(n_users, n_items)`` array-like of user preference
            vectors (one row per user).

    Returns:
        An ``(n_users, n_users)`` symmetric array. Off-diagonal entry
        ``[i, j]`` is ``dot(u_i, u_j) / (|u_i| * |u_j|)`` rounded to three
        decimals, or 0 when either user has an all-zero vector. The
        diagonal is always 1.
    """
    user_vec = np.asarray(user_vec, dtype=float)

    # Scale every row to unit length; all-zero rows stay zero so that
    # users without any liked item get similarity 0 instead of NaN.
    norms = np.linalg.norm(user_vec, axis=1, keepdims=True)
    unit_rows = np.divide(
        user_vec, norms, out=np.zeros_like(user_vec), where=norms != 0
    )

    # The dot product of two unit vectors is their cosine; this covers
    # every user pair, including the last user.
    user_sim = unit_rows @ unit_rows.T
    np.round(user_sim, 3, out=user_sim)

    # Every user is fully similar to themselves, even with zero degree.
    np.fill_diagonal(user_sim, 1)
    return user_sim

　　3，新的评分矩阵生成：

# Predicted rating matrix.
# Only a single pass is made: the predictions are not fed back in for
# further iterations.
def RecS_rating(user_sim, user_rating, user_vec):
    """Predict a rating for every (user, movie) pair from similar users.

    For user ``i`` and movie ``j`` the prediction is::

        trunc( sum_k sim[i, k] * (r[k, j] - r[i, j]) / sum_k sim[i, k]
               + avg_i )

    clipped to the range ``[0, 5]``. ``avg_i`` is the sum of user ``i``'s
    ratings divided by the row sum of ``user_vec`` (0 when that sum is 0).
    Users whose similarity row sums to 0 get 5 for every movie.

    The whole computation is a single matrix product, so no JIT compiler
    is needed.

    NOTE(review): ``avg_i`` divides by the number of entries set in
    ``user_vec`` rather than by the number of rated movies, and the
    deviation is taken from ``r[i, j]`` rather than from the neighbour's
    mean. Both are kept from the original code; confirm they are intended.

    Args:
        user_sim: ``(n_users, n_users)`` similarity matrix.
        user_rating: ``(n_users, n_movies)`` rating matrix.
        user_vec: ``(n_users, n_movies)`` preference matrix.

    Returns:
        ``(n_users, n_movies)`` array of whole-number ratings in ``[0, 5]``.
    """
    user_sim = np.asarray(user_sim, dtype=float)
    user_rating = np.asarray(user_rating, dtype=float)
    user_vec = np.asarray(user_vec, dtype=float)

    all_sim = user_sim.sum(axis=1)
    degree = user_vec.sum(axis=1)
    rating_sum = user_rating.sum(axis=1)
    rating_avg = np.divide(
        rating_sum, degree, out=np.zeros_like(rating_sum), where=degree != 0
    )

    # sum_k s_ik * (r_kj - r_ij) == (S @ R)_ij - (sum_k s_ik) * r_ij.
    # The weight is the similarity to neighbour k (second index is a
    # user), never the movie index.
    weighted = user_sim @ user_rating - all_sim[:, None] * user_rating

    # Users with no similarity mass at all default to the top rating.
    user_NewRating = np.full(user_rating.shape, 5.0)
    has_sim = all_sim != 0
    user_NewRating[has_sim] = np.trunc(
        weighted[has_sim] / all_sim[has_sim, None]
        + rating_avg[has_sim, None]
    )

    np.clip(user_NewRating, 0, 5, out=user_NewRating)
    return user_NewRating

　　4，资源分配矩阵生成：

# Resource-allocation (quota) matrix
def Wight_mov(degree_user, degree_mov, user_vec):
    """Build the movie-to-movie resource-allocation matrix.

    Entry ``[i, j]`` is::

        (1 / degree_mov[j]) * sum_k user_vec[k, i] * user_vec[k, j]
                                    / degree_user[k]

    rounded to four decimals. Users with degree 0 are left out of the sum
    and columns whose movie degree is 0 are all zero.

    Args:
        degree_user: Per-user degrees, shape ``(1, n_users)`` or
            ``(n_users,)``.
        degree_mov: Per-movie degrees, shape ``(1, n_movies)`` or
            ``(n_movies,)``.
        user_vec: ``(n_users, n_movies)`` preference matrix.

    Returns:
        ``(n_movies, n_movies)`` array of allocation weights.
    """
    user_vec = np.asarray(user_vec, dtype=float)
    k_user = np.asarray(degree_user, dtype=float).reshape(-1, 1)
    k_mov = np.asarray(degree_mov, dtype=float).reshape(-1)

    # Divide each user's row by that user's degree; zero-degree users
    # contribute nothing.
    per_user_share = np.divide(
        user_vec, k_user, out=np.zeros_like(user_vec), where=k_user != 0
    )

    # (M, U) @ (U, M): summing over users replaces the innermost loop.
    overlap = per_user_share.T @ user_vec

    # Normalise column j by the degree of movie j; zero-degree movies
    # keep a zero column.
    w_mov = np.divide(
        overlap, k_mov, out=np.zeros_like(overlap), where=k_mov != 0
    )
    return np.round(w_mov, 4)

　　5，Top-K(用于选择推荐值或者评分最高的K个值)矩阵：

# Top-K selection
def partition_arg_topK(matrix, K, axis=0):
    """Return the indices of the K smallest entries along ``axis``, sorted.

    The result is ordered ascending by value. To obtain the K largest
    entries (highest scores first), pass the negated matrix.

    Args:
        matrix: 2-D array-like of scores.
        K: Number of entries to select along ``axis``. Values greater than
            or equal to the axis length select the whole axis.
        axis: 0 to select within each column, 1 to select within each row.

    Returns:
        Integer index array of shape ``(K, n_cols)`` for ``axis=0`` or
        ``(n_rows, K)`` for ``axis=1``.
    """
    matrix = np.asarray(matrix)

    # argpartition needs kth strictly below the axis length; when all
    # entries are requested a full argsort gives the same answer.
    if K >= matrix.shape[axis]:
        return np.argsort(matrix, axis=axis)

    # Partial partition: the first K positions hold the K smallest
    # entries in arbitrary order.
    a_part = np.argpartition(matrix, K, axis=axis)

    # Column-wise selection
    if axis == 0:
        row_index = np.arange(matrix.shape[1 - axis])
        # Sort only the K selected candidates of every column.
        a_sec_argsort_K = np.argsort(
            matrix[a_part[0:K, :], row_index], axis=axis
        )
        return a_part[0:K, :][a_sec_argsort_K, row_index]

    # Row-wise selection
    column_index = np.arange(matrix.shape[1 - axis])[:, None]
    a_sec_argsort_K = np.argsort(
        matrix[column_index, a_part[:, 0:K]], axis=axis
    )
    return a_part[:, 0:K][column_index, a_sec_argsort_K]

　　6，输出参数（以协同过滤为例：评判标准定为用户已选产品的新旧评分之差，再取平均）；网状推荐的评判方法不同，详见另一个思维导图：

# Evaluation for collaborative filtering: for every user, average the
# difference between predicted and original rating over the movies the
# user selected (user_vec == 1), then average over all users that
# selected at least one movie.
# Expects user_vec, user_rating and new_rating to be defined already.
n_users = user_vec.shape[0]
degree = np.sum(user_vec, axis=1)
selected = user_vec == 1

# Per-user sum of (new - old) rating over the selected movies only.
diff_sum = np.where(selected, new_rating - user_rating, 0).sum(axis=1)

correct_corr = np.zeros((1, n_users))
valid = degree != 0
# Users with degree 0 keep a score of 0 and are left out of the average.
correct_corr[0, valid] = diff_sum[valid] / degree[valid]
sum_valid = int(np.count_nonzero(valid))

# With no valid user the average is undefined; print nan explicitly
# rather than dividing by zero.
print(np.sum(correct_corr) / sum_valid if sum_valid else float("nan"))

基于简单的协同过滤和网状推荐系统的实现（Python 和 MovieLens 数据集）

猜你喜欢