[Python嗯~机器学习]---用python3来描述协同过滤

版权声明:允许转载请注明作者 https://blog.csdn.net/kepengs/article/details/85633600

协同过滤

推荐系统

In [1]:

import numpy as np
from scipy.optimize import minimize
import pandas as pd

In [2]:

def getRecommender(Y, R, params=None, n=10, theLambda=10, maxIter=100):
    """
    Args:
        Y - 用户对影片的评分矩阵
        R - 用户j是否对影片i评分的矩阵 (0/1)
        params - 若有初始化参数,可在此传入(Theta, X)
        n - 电影的特征数
        theLambda - 正则化参数
        maxIter - 最大迭代次数
    Returns:
        train - 训练函数
        predict - 预测函数
        getTopRecommends - 获取特定影片的最相似推荐
    """

    # 影片数,用户数
    nm, nu = Y.shape

    # 标准化影片的评分
    mu = np.zeros((Y.shape[0], 1), dtype=np.float)
    for i in range(nm):
        totalRates = np.sum(Y[i])
        validCount = len(np.nonzero(R[i])[0])
        mu[i] = totalRates / validCount
    Y = Y - mu

    def roll(Theta, X):
        """
        对于模型而言,Theta和X都是待学习的参数,需要放在一起直接优化
        Args:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        Returns:
            vector - 折叠后的参数
        """

        #return np.hstack((X.A.T.flatten(), Theta.A.T.flatten()))
        return np.hstack((X.A.T.flatten(), Theta.A.T.flatten()))

    def unroll(vector):
        """
        Args:
            vector 参数向量
        Returns:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        """
        X = np.mat(vector[:nm * n].reshape(n, nm).T)
        Theta = np.mat(vector[nm * n:].reshape(n, nu).T)
        return Theta, X

    def initParams():
        """初始化参数

        Returns:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        """
        Theta = np.mat(np.random.rand(nu, n))
        X = np.mat(np.random.rand(nm, n))
        return Theta, X

    def regularize(param):
        """对参数进行正则化
        Args:
            param - 参数
        Return:
            regParam - 正规化后的参数
        """
        return theLambda * 0.5 * np.sum(np.power(param, 2))

    def J(params):
        """代价函数

        Args:
            params - 参数向量
            nu - 用户数
            nm - 电影数
            n - 特征数
        Return:
            J - 预测代价
        """
        # 参数展开
        Theta, X = unroll(params)
        # 计算误差
        rows, cols = np.nonzero(R)
        # 预测
        h = predict(Theta, X)
        diff = h - Y
        diff[R != 1] = 0
        error = 0.5 * np.sum(np.power(diff, 2))
        
        #  正则化 Theta
        regTheta = regularize(Theta)
        #  正规化 x
        regX = regularize(X)
        
        return error + regTheta + regX

    def gradient(params):
        """计算梯度

        Args:
            params - 参数向量
        Returns:
            grad - 梯度向量
        """
        Theta, X = unroll(params)
        
        # 当前梯度初始化成0
        ThetaGrad = np.mat(np.zeros(Theta.shape))
        XGrad = np.mat(np.zeros(X.shape))
        
        error = predict(Theta, X) - Y
        error[R != 1] = 0
        
        # 这里只需要计算梯度
        ThetaGrad = error.T * X + theLambda * Theta
        XGrad =  error * Theta + theLambda * X
        
        return roll(ThetaGrad, XGrad)

    def train():
        """训练

        Returns:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        """
        # 初始化参数
        if not params:
            Theta, X = initParams()
        else:
            Theta = params['Theta']
            X = params['X']
            
        # 最小化目标函数
        res = minimize(J, x0=roll(Theta, X), jac=gradient,
                       method='CG', options={'disp': True, 'maxiter': maxIter})
        Theta, X = unroll(res.x)
        return Theta, X

    def predict(Theta, X):
        """预测
        Args:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        Return:
            h 预测
        """
        return X * Theta.T + mu

    def getTopRecommends(Theta, X, i, count, rated, items):
        """获得推荐

        Args:
            Theta - 用户偏好矩阵
            X - 影片特征矩阵
            i - 用户索引
            count - 目标推荐数量
            rated - 已经评价的影片id
            items - 影片库
        Returns:
            topRecommends - 推荐项目
        """
        predictions = predict(Theta, X)[:, i]
        
        # 实用pandas的DataFrame可以将不同类型数据放在一个Frame中,方便排序等操作
        # 相较而言,numpy的多维数组要求内部类型完全一致
        df = pd.DataFrame(data=predictions, columns=['prediction',])
        df['movie'] = items
        df.sort_values(by='prediction', ascending=False,inplace=True)
        # 不推荐已经评过分的影片
        df.drop(rated, inplace=True)
        
        return df[0:count]

    return train, predict, getTopRecommends

In [3]:

from scipy.io import loadmat

In [4]:

data = loadmat('data/ex8_movies.mat')
Y = data['Y']
R = data['R']

In [5]:

movieParams = loadmat('data/ex8_movieParams.mat')
nm = movieParams['num_movies'][0,0]
n = movieParams['num_features'][0,0]

In [6]:

def getMovie(line):
    return b' '.join(line.split()[1:])
movieList = []
with open('data/movie_ids.txt', 'rb') as f:
    for line in f:
        movieList.append(getMovie(line.strip()))

In [7]:

myRatings = np.mat(np.zeros((nm,1)))

myRatings[0] = 4
myRatings[97] = 2
myRatings[6] = 3
myRatings[11] = 5
myRatings[53] = 4
myRatings[63] = 5
myRatings[65] = 3
myRatings[68] = 5
myRatings[182] = 4
myRatings[225] = 5
myRatings[354] = 5

print(u'我的评分:')
for i in range(nm):
    if myRatings[i] > 0:
        print('{:<50} {}'.format( movieList[i].decode('utf-8'), myRatings[i].A[0,0]))
我的评分:
Toy Story (1995)                                   4.0
Twelve Monkeys (1995)                              3.0
Usual Suspects, The (1995)                         5.0
Outbreak (1995)                                    4.0
Shawshank Redemption, The (1994)                   5.0
While You Were Sleeping (1995)                     3.0
Forrest Gump (1994)                                5.0
Silence of the Lambs, The (1991)                   2.0
Alien (1979)                                       4.0
Die Hard 2 (1990)                                  5.0
Sphere (1998)                                      5.0

In [8]:

# 将我们的新用户数据加入
Y = np.column_stack((myRatings, Y))
R = np.column_stack((myRatings, R)).astype(bool)

In [9]:

train, predict, getTopRecommends = getRecommender(
    Y, R, n=n, theLambda=10.0)

In [10]:

Theta, X = train()
Warning: Maximum number of iterations has been exceeded.
         Current function value: 71136.873769
         Iterations: 100
         Function evaluations: 156
         Gradient evaluations: 156

In [11]:

rated = np.nonzero(myRatings)[0].tolist()
# -1 就是我们刚才加入的最新用户
topRecommends = getTopRecommends(Theta, X, -1, 10, rated, movieList)
topRecommends

Out[11]:

  prediction movie
813 4.760439 b'Great Day in Harlem, A (1994)'
1598 4.249605 b"Someone Else's America (1995)"
1652 3.786093 b'Entertaining Angels: The Dorothy Day Story (...
1535 3.749902 b'Aiqing wansui (1994)'
1499 3.680961 b'Santa with Muscles (1996)'
1200 3.648143 b'Marlene Dietrich: Shadow and Light (1996)'
1121 3.638303 b'They Made Me a Criminal (1939)'
1497 3.398012 b'Farmer & Chase (1995)'
1491 3.396144 b'Window to Paris (1994)'
1612 3.246720 b'Tokyo Fist (1995)'

猜你喜欢

转载自blog.csdn.net/kepengs/article/details/85633600