基于SVD的推荐系统实现

本文是对唐宇迪老师推荐算法实战课程源码的复现,原理部分尚未仔细解释,不足之处望多多指正。

导入所需工具包

import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import recmetrics
import matplotlib.pyplot as plt
from surprise import Reader,SVD,Dataset
from surprise.model_selection import train_test_split

加载评分数据(这里只筛选出评分不低于3的一部分数据用于实验)

# Load the ratings table, keep only the rows whose rating is at least 3,
# and renumber the surviving rows from 0.
ratings = (
    pd.read_csv("ratings.csv")
    .query('rating>=3')
    .reset_index(drop=True)
)
# Notebook display: preview the first rows.
ratings.head()
userIdmovieIdratingtimestamp
0123.51112486027
11293.51112484676
21323.51112484819
31473.51112484727
41503.51112484580

数据过滤:只保留评过分的电影超过1000部的用户,方便起见,以避免出现稀疏矩阵的情况

# Keep only the users that have more than n rows left in ratings.
n = 1000
ratings_per_user = ratings['userId'].value_counts()
users = ratings_per_user[ratings_per_user > n].index.tolist()
ratings = ratings.query('userId in @users')
print(ratings.shape)
# Notebook display: preview the filtered table.
ratings.head()
(1317902, 4)
userIdmovieIdratingtimestamp
1591815615.01037739266
1591915625.01040937649
1592015643.01038801803
1592115653.01040944583
1592215664.01037822117

从movies.csv中取出对应电影的数据(电影的title、genres)

# Read the movie table and report its (rows, columns) shape.
movies = pd.read_csv('movies.csv')
print(movies.shape)
# Notebook display: preview the first rows.
movies.head()
(27278, 3)
movieIdtitlegenres
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
12Jumanji (1995)Adventure|Children|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama|Romance
45Father of the Bride Part II (1995)Comedy
# Restrict the movie table to the movie ids that occur in the filtered
# ratings table, then use movieId as the index.
rated_movies = ratings['movieId'].tolist()
movies = movies.query('movieId in @rated_movies').set_index('movieId', drop=True)

制作数据集里的题材特征(将两张表实现关联)

# Split the "|"-separated genres string into one column per genre, then
# move the index back into a regular column.
movies = movies['genres'].str.split("|", expand=True).reset_index()
# Notebook display: preview the genre columns.
movies.head()
movieId0123456789
01AdventureAnimationChildrenComedyFantasyNoneNoneNoneNoneNone
12AdventureChildrenFantasyNoneNoneNoneNoneNoneNoneNone
23ComedyRomanceNoneNoneNoneNoneNoneNoneNoneNone
34ComedyDramaRomanceNoneNoneNoneNoneNoneNoneNone
45ComedyNoneNoneNoneNoneNoneNoneNoneNoneNone

Long tail绘图

  • 使用recmetrics工具包中的long_tail_plot函数
  • 其中percentage是红色虚线的百分比
import matplotlib.pyplot as plt

# Draw the recmetrics long-tail plot over the movieId column of ratings
# on a 15x7 figure.
fig = plt.figure(figsize=(15, 7))
recmetrics.long_tail_plot(
    df=ratings,
    item_id_column='movieId',
    interaction_type='movie ratings',
    percentage=0.5,
    x_labels=False,
)

在这里插入图片描述

利用surprise包构建推荐系统

# Wrap the (user, item, rating) columns in a surprise Dataset; the Reader
# is told that ratings lie on a 0-5 scale.
reader = Reader(rating_scale=(0, 5))
rating_triples = ratings[['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(rating_triples, reader)

# Hold out 25% of the ratings as the test set.
trainset, testset = train_test_split(data, test_size=0.25)

# Fit an SVD model with its default settings on the training part.
algo = SVD()
algo.fit(trainset)
<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1af42397080>
# Predict every held-out rating and collect the predictions in a table,
# dropping the "details" column and renaming the four remaining columns.
test = pd.DataFrame(algo.test(testset)).drop("details", axis=1)
test.columns = ['userId', 'movieId', 'actual', 'predictions']
# Notebook display: preview the predictions.
test.head()
userIdmovieIdactualpredictions
096601344053.53.892328
11191486084.54.328727
213045929174.03.565206
39401322874.03.967104
43298493.03.383482
# Evaluation metrics: print the MSE, then the RMSE, of the predicted
# ratings against the actual ratings.
actual_ratings, predicted_ratings = test.actual, test.predictions
print(recmetrics.mse(actual_ratings, predicted_ratings))
print(recmetrics.rmse(actual_ratings, predicted_ratings))
0.26556372554189756
0.5153287548176383
# Build the lookup "model": a user x movie table of predicted ratings,
# with 0 for every user/movie pair that has no prediction.
predicted_matrix = test.pivot_table(
    index='userId',
    columns='movieId',
    values='predictions',
)
cf_model = predicted_matrix.fillna(0)
# Notebook display: preview the matrix.
cf_model.head()
movieId12345678910...131021131106131118131122131126131132131168131174131176131250
userId
1560.0000000.00.0000000.00.04.3187010.00.00.04.106054...0.00.00.00.00.00.00.00.00.00.0
2084.0805670.00.0000000.00.00.0000000.00.00.00.000000...0.00.00.00.00.00.00.00.00.00.0
3594.1439840.00.0000000.00.00.0000000.00.00.00.000000...0.00.00.00.00.00.00.00.00.00.0
3940.0000000.00.0000000.00.00.0000000.00.00.00.000000...0.00.00.00.00.00.00.00.00.00.0
5720.0000000.03.3661220.00.00.0000000.00.00.00.000000...0.00.00.00.00.00.00.00.00.00.0

5 rows × 16632 columns

# 推荐系统设计
def get_users_predictions(user_id, n, model):
    """Return the ids of the n highest-scored items for one user.

    Args:
        user_id: Row label to look up in ``model``.
        n: Number of items to return.
        model: DataFrame whose row ``user_id`` holds one score per item,
            with the item ids as column labels.

    Returns:
        List of item ids ordered by descending score.
    """
    # One row of the model becomes a single-column frame indexed by item id.
    scores = pd.DataFrame(model.loc[user_id])
    scores.columns = ['predicted_rating']
    ranked = scores.sort_values('predicted_rating', ascending=False)
    return ranked.iloc[:n].index.tolist()
# Usage example: the 10 highest-scored movies for user 156.
get_users_predictions(user_id=156, n=10, model=cf_model)
[2028, 2762, 1198, 1704, 1242, 593, 1210, 919, 2268, 1136]
# Batch results: collapse the test predictions to one row per user.
# 'actual' holds the distinct movie ids that user has in the test set.
# A plain callable followed by to_frame() is used because passing a
# {name: func} dict to SeriesGroupBy.agg raises SpecificationError in
# pandas >= 1.0.
test = (
    test.groupby('userId')['movieId']
    .agg(lambda movie_ids: list(set(movie_ids)))
    .to_frame('actual')
)

# Top-10 recommendations per user, built in the same order as test.index
# so the list lines up with the rows when assigned as a column.
cf_recs = [get_users_predictions(user, 10, cf_model) for user in test.index]

test['cf_predictions'] = cf_recs
# Notebook display: preview actual vs. recommended items.
test.head()
actualcf_predictions
userId
156[6, 2056, 10, 15, 17, 4117, 22, 23, 24, 2073, ...[2028, 2762, 1198, 1704, 1242, 593, 1210, 919,...
208[3072, 1, 69122, 2567, 3079, 2570, 44555, 1036...[912, 608, 924, 1207, 898, 922, 1256, 44555, 7...
359[1, 32770, 515, 39427, 2565, 37382, 4103, 6964...[1272, 953, 2804, 2762, 2918, 1207, 1233, 1201...
394[1537, 33794, 26116, 3077, 4617, 2058, 2571, 5...[858, 922, 2019, 608, 5291, 1228, 3435, 1219, ...
572[3, 108548, 7173, 4104, 54281, 91658, 2571, 30...[2571, 50, 589, 79132, 47, 58559, 7361, 2959, ...

实验对比

# Popularity baseline: the 10 movie ids with the most rows in ratings are
# recommended to every user.
popularity_recs = ratings.movieId.value_counts().head(10).index.tolist()

# Each user gets an independent copy of the list; storing the same list
# object in every row would let a later in-place change to one row's
# recommendations silently alter all of them.
pop_recs = [list(popularity_recs) for _ in test.index]

test['pop_predictions'] = pop_recs
# Notebook display: preview the table with the new column.
test.head()
actualcf_predictionspop_predictions
userId
156[6, 2056, 10, 15, 17, 4117, 22, 23, 24, 2073, ...[2028, 2762, 1198, 1704, 1242, 593, 1210, 919,...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
208[3072, 1, 69122, 2567, 3079, 2570, 44555, 1036...[912, 608, 924, 1207, 898, 922, 1256, 44555, 7...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
359[1, 32770, 515, 39427, 2565, 37382, 4103, 6964...[1272, 953, 2804, 2762, 2918, 1207, 1233, 1201...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
394[1537, 33794, 26116, 3077, 4617, 2058, 2571, 5...[858, 922, 2019, 608, 5291, 1228, 3435, 1219, ...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
572[3, 108548, 7173, 4104, 54281, 91658, 2571, 30...[2571, 50, 589, 79132, 47, 58559, 7361, 2959, ...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
# Random baseline: for every user draw 10 entries from the movieId column
# of ratings. The draw is over rating rows, not over distinct movies, so a
# movie with more ratings is more likely to be picked.
ran_recs = [ratings.movieId.sample(10).values.tolist() for _ in test.index]
test['random_predictions'] = ran_recs
# Notebook display: preview the table with the new column.
test.head()
actualcf_predictionspop_predictionsrandom_predictions
userId
156[6, 2056, 10, 15, 17, 4117, 22, 23, 24, 2073, ...[2028, 2762, 1198, 1704, 1242, 593, 1210, 919,...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...[6310, 88125, 7458, 3182, 60684, 5582, 2975, 1...
208[3072, 1, 69122, 2567, 3079, 2570, 44555, 1036...[912, 608, 924, 1207, 898, 922, 1256, 44555, 7...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...[3823, 2109, 6924, 5120, 4238, 1407, 5266, 299...
359[1, 32770, 515, 39427, 2565, 37382, 4103, 6964...[1272, 953, 2804, 2762, 2918, 1207, 1233, 1201...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...[57421, 1032, 8379, 2539, 5010, 2100, 111, 196...
394[1537, 33794, 26116, 3077, 4617, 2058, 2571, 5...[858, 922, 2019, 608, 5291, 1228, 3435, 1219, ...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...[77201, 1372, 7484, 3250, 521, 1396, 5971, 260...
572[3, 108548, 7173, 4104, 54281, 91658, 2571, 30...[2571, 50, 589, 79132, 47, 58559, 7361, 2959, ...[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...[5872, 6982, 832, 4495, 70742, 65596, 1, 971, ...

topK 求精度与召回率Precision与Recall

覆盖率

def prediction_coverage(predicted, catalog):
    """Return the share of the catalog that appears in the recommendations.

    Args:
        predicted: Iterable of recommendation lists, one list per user.
        catalog: Sized collection of all item ids.

    Returns:
        Number of distinct recommended items divided by ``len(catalog)``,
        as a percentage rounded to two decimals.
    """
    distinct_items = {item for user_recs in predicted for item in user_recs}
    share = len(distinct_items) / (len(catalog) * 1.0)
    return round(share * 100, 2)
# The catalog is every distinct movie id in the filtered ratings table.
catalog = ratings['movieId'].unique().tolist()

# Coverage of each recommender over that catalog.
cf_coverage = prediction_coverage(cf_recs, catalog)
pop_coverage = prediction_coverage(pop_recs, catalog)
random_coverage = prediction_coverage(ran_recs, catalog)

# Scores and labels are listed in the same order for the plot.
model_names = ['Random Recommender', 'Popular Recommender', 'Collaborative Fillter']
coverage_scores = [random_coverage, pop_coverage, cf_coverage]

fig = plt.figure(figsize=(7, 5))
recmetrics.coverage_plot(coverage_scores, model_names)

在这里插入图片描述

问题(review)

*  以数据为基础的论文查找技巧;
*  以SVD为基础的推荐算法的原理;(之前接触SVD主要是在矩阵计算中将其用于图像的压缩处理,此处SVD用于推荐排序的原理还不是很清楚,需要查资料学习);
*  原来(大三阶段)看过的研究文献,其数据集大多来自kaggle,它们的实验过程与唐老师课程的区别在哪?

版权声明:本文为Zengmeng1998原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。