唐宇迪老师推荐算法实战课程的源码复现。原理还未仔细解释，不足之处望多多指正。

导入所需工具包

import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import recmetrics
import matplotlib.pyplot as plt
from surprise import Reader,SVD,Dataset
from surprise.model_selection import train_test_split

加载评分数据（这里只筛选出一部分数据用于实验）

# Load the raw ratings and keep only the rows whose rating is 3 or higher.
# The index is renumbered from zero so it is contiguous after the filter.
ratings = pd.read_csv("ratings.csv")
ratings = ratings[ratings["rating"] >= 3].reset_index(drop=True)

ratings.head()

	userId	movieId	rating	timestamp
0	1	2	3.5	1112486027
1	1	29	3.5	1112484676
2	1	32	3.5	1112484819
3	1	47	3.5	1112484727
4	1	50	3.5	1112484580

数据过滤：过滤出评分超过1000部电影的用户，方便起见避免出现稀疏矩阵的情况

# Keep only the users that have more than `n` rows in `ratings`; the
# accompanying note gives the motivation as avoiding an overly sparse matrix.
n = 1000
users = (
    ratings["userId"]
    .value_counts()
    .loc[lambda rating_counts: rating_counts > n]
    .index.tolist()
)

ratings = ratings[ratings["userId"].isin(users)]
print(ratings.shape)
ratings.head()

(1317902, 4)

	userId	movieId	rating	timestamp
15918	156	1	5.0	1037739266
15919	156	2	5.0	1040937649
15920	156	4	3.0	1038801803
15921	156	5	3.0	1040944583
15922	156	6	4.0	1037822117

从 movies.csv 中取出对应的电影数据（电影的 title、genres）

# Load the movie table from movies.csv, then show its shape and first rows.
movies=pd.read_csv('movies.csv')
print(movies.shape)
movies.head()

(27278, 3)

	movieId	title	genres
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance
4	5	Father of the Bride Part II (1995)	Comedy

# Keep only the movies that occur in the filtered `ratings` table and index
# the result by movieId.
rated_movies = ratings.movieId.tolist()
movies = movies[movies["movieId"].isin(rated_movies)].set_index("movieId")

制作数据集里的题材特征（将两张表实现关联）

# Split the pipe-separated `genres` string into one column per position, then
# move the index back into a regular column.
movies = movies["genres"].str.split("|", expand=True).reset_index()
movies.head()

	movieId	0	1	2	3	4	5	6	7	8	9
0	1	Adventure	Animation	Children	Comedy	Fantasy	None	None	None	None	None
1	2	Adventure	Children	Fantasy	None	None	None	None	None	None	None
2	3	Comedy	Romance	None	None	None	None	None	None	None	None
3	4	Comedy	Drama	Romance	None	None	None	None	None	None	None
4	5	Comedy	None	None	None	None	None	None	None	None	None

Long tail绘图

使用 recmetrics 工具包中的 long_tail_plot 函数
其中percentage是红色虚线的百分比

import matplotlib.pyplot as plt
# Open a 15x7 figure, then draw recmetrics' long-tail plot for `ratings`
# with movieId as the item column.
fig=plt.figure(figsize=(15,7))
# NOTE(review): per the accompanying text, `percentage` sets where the red
# dashed line is drawn -- confirm against the recmetrics documentation.
recmetrics.long_tail_plot(df= ratings,
                         item_id_column='movieId',
                         interaction_type='movie ratings',
                         percentage=0.5,
                         x_labels=False)

在这里插入图片描述

利用surprise包构建推荐系统

# Put the data into the format surprise expects: a Reader declaring a 0-5
# rating scale and a Dataset built from the userId/movieId/rating columns.
reader=Reader(rating_scale=(0,5))
data=Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)
# Split the dataset, holding out 25% of it as the test set.
trainset,testset=train_test_split(data,test_size=0.25)

# Train an SVD model (constructed with no arguments) on the training split.
algo=SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1af42397080>

# Evaluate on the held-out split and tabulate the predictions: the `details`
# column is discarded and the remaining four columns are renamed.
test = pd.DataFrame(algo.test(testset)).drop(columns="details")
test.columns = ['userId', 'movieId', 'actual', 'predictions']
test.head()

	userId	movieId	actual	predictions
0	96601	34405	3.5	3.892328
1	119148	608	4.5	4.328727
2	130459	2917	4.0	3.565206
3	94013	2287	4.0	3.967104
4	32984	9	3.0	3.383482

# MSE and RMSE evaluation metrics, computed by recmetrics from the `actual`
# and `predictions` columns of the test frame.
print(recmetrics.mse(test.actual,test.predictions))
print(recmetrics.rmse(test.actual,test.predictions))

0.26556372554189756
0.5153287548176383

# Build the model table: one row per user, one column per movie, holding the
# predicted rating; user/movie pairs without a prediction are filled with 0.
cf_model = (
    test
    .pivot_table(values='predictions', index='userId', columns='movieId')
    .fillna(0)
)
cf_model.head()

movieId	1	2	3	4	5	6	7	8	9	10	...	131021	131106	131118	131122	131126	131132	131168	131174	131176	131250
userId
156	0.000000	0.0	0.000000	0.0	0.0	4.318701	0.0	0.0	0.0	4.106054	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
208	4.080567	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
359	4.143984	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
394	0.000000	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
572	0.000000	0.0	3.366122	0.0	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

5 rows × 16632 columns

# Recommender design
def get_users_predictions(user_id, n, model):
    """Return the labels of the `n` highest-scored items for one user.

    Args:
        user_id: Row label of the user in `model`.
        n: Number of items to return.
        model: DataFrame of predicted ratings with one row per user and one
            column per item.

    Returns:
        A list of column labels of `model`, ordered from the highest to the
        lowest value in the user's row.
    """
    user_scores = model.loc[user_id].to_frame(name='predicted_rating')
    ranked = user_scores.sort_values(by='predicted_rating', ascending=False)
    return ranked.iloc[:n].index.tolist()

# Usage example: the 10 top-scored movies for user 156 according to cf_model.
get_users_predictions(156,10,cf_model)

[2028, 2762, 1198, 1704, 1242, 593, 1210, 919, 2268, 1136]

# Batch results: one row per user holding the distinct movies that user has in
# the test frame (`actual`) and the model's top-10 list (`cf_predictions`).
#
# The per-user lists are built with apply(...).to_frame('actual') because
# passing a renaming dict to SeriesGroupBy.agg was removed in pandas 1.0 and
# raises SpecificationError there.
test = (
    test.groupby('userId')['movieId']
    .apply(lambda movie_ids: list(set(movie_ids)))
    .to_frame('actual')
)
cf_recs = [get_users_predictions(user, 10, cf_model) for user in test.index]

test['cf_predictions'] = cf_recs
test.head()

	actual	cf_predictions
userId
156	[6, 2056, 10, 15, 17, 4117, 22, 23, 24, 2073, ...	[2028, 2762, 1198, 1704, 1242, 593, 1210, 919,...
208	[3072, 1, 69122, 2567, 3079, 2570, 44555, 1036...	[912, 608, 924, 1207, 898, 922, 1256, 44555, 7...
359	[1, 32770, 515, 39427, 2565, 37382, 4103, 6964...	[1272, 953, 2804, 2762, 2918, 1207, 1233, 1201...
394	[1537, 33794, 26116, 3077, 4617, 2058, 2571, 5...	[858, 922, 2019, 608, 5291, 1228, 3435, 1219, ...
572	[3, 108548, 7173, 4104, 54281, 91658, 2571, 30...	[2571, 50, 589, 79132, 47, 58559, 7361, 2959, ...

实验对比

# Popularity baseline: the ten movieIds that occur most often in `ratings`.
popularity_recs = ratings['movieId'].value_counts().index[:10].tolist()

# Every user receives the same list (the same list object is reused per row).
pop_recs = [popularity_recs for _ in test.index]

test['pop_predictions'] = pop_recs
test.head()

	actual	cf_predictions	pop_predictions
userId
156	[6, 2056, 10, 15, 17, 4117, 22, 23, 24, 2073, ...	[2028, 2762, 1198, 1704, 1242, 593, 1210, 919,...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
208	[3072, 1, 69122, 2567, 3079, 2570, 44555, 1036...	[912, 608, 924, 1207, 898, 922, 1256, 44555, 7...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
359	[1, 32770, 515, 39427, 2565, 37382, 4103, 6964...	[1272, 953, 2804, 2762, 2918, 1207, 1233, 1201...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
394	[1537, 33794, 26116, 3077, 4617, 2058, 2571, 5...	[858, 922, 2019, 608, 5291, 1228, 3435, 1219, ...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
572	[3, 108548, 7173, 4104, 54281, 91658, 2571, 30...	[2571, 50, 589, 79132, 47, 58559, 7361, 2959, ...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...

# Random baseline: for every user, draw ten movieIds at random from the rows
# of `ratings`.
ran_recs = [ratings['movieId'].sample(10).values.tolist() for _ in test.index]
test['random_predictions'] = ran_recs
test.head()

	actual	cf_predictions	pop_predictions	random_predictions
userId
156	[6, 2056, 10, 15, 17, 4117, 22, 23, 24, 2073, ...	[2028, 2762, 1198, 1704, 1242, 593, 1210, 919,...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...	[6310, 88125, 7458, 3182, 60684, 5582, 2975, 1...
208	[3072, 1, 69122, 2567, 3079, 2570, 44555, 1036...	[912, 608, 924, 1207, 898, 922, 1256, 44555, 7...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...	[3823, 2109, 6924, 5120, 4238, 1407, 5266, 299...
359	[1, 32770, 515, 39427, 2565, 37382, 4103, 6964...	[1272, 953, 2804, 2762, 2918, 1207, 1233, 1201...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...	[57421, 1032, 8379, 2539, 5010, 2100, 111, 196...
394	[1537, 33794, 26116, 3077, 4617, 2058, 2571, 5...	[858, 922, 2019, 608, 5291, 1228, 3435, 1219, ...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...	[77201, 1372, 7484, 3250, 521, 1396, 5971, 260...
572	[3, 108548, 7173, 4104, 54281, 91658, 2571, 30...	[2571, 50, 589, 79132, 47, 58559, 7361, 2959, ...	[1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...	[5872, 6982, 832, 4495, 70742, 65596, 1, 971, ...

topK 求精度与召回率Precision与Recall

覆盖率

def prediction_coverage(predicted, catalog):
    """Return the percentage of the catalog covered by the recommendations.

    Args:
        predicted: Iterable of recommendation lists.
        catalog: Collection of all items; only its length is used.

    Returns:
        The number of distinct recommended items divided by len(catalog),
        expressed as a percentage and rounded to two decimals.
    """
    distinct_items = set()
    for recommendations in predicted:
        distinct_items.update(recommendations)
    coverage_pct = len(distinct_items) / (len(catalog) * 1.0) * 100
    return round(coverage_pct, 2)

# Coverage of each recommender, measured against the distinct movieIds
# present in `ratings`.
catalog = ratings['movieId'].unique().tolist()
random_coverage, pop_coverage, cf_coverage = (
    prediction_coverage(recs, catalog)
    for recs in (ran_recs, pop_recs, cf_recs)
)

# Coverage: the three scores and their display names, listed in the same
# order, are handed to recmetrics' coverage plot on a 7x5 figure.
coverage_scores=[random_coverage,pop_coverage,cf_coverage]
model_names= ['Random Recommender','Popular Recommender','Collaborative Fillter']

fig=plt.figure(figsize=(7,5))
recmetrics.coverage_plot(coverage_scores,model_names)

在这里插入图片描述

问题（review）

*  以数据为基础的论文查找技巧；
*  以SVD为基础的推荐算法的原理；（之前接触SVD主要是在矩阵计算中，将其用于图像的压缩处理；此处SVD用于推荐排序的原理还不是很清楚，需要查资料学习）；
*  原来（大三阶段）看过的研究文献，数据集大多来自Kaggle，其实验过程与唐老师课程的区别在哪？

原文链接：https://blog.csdn.net/Zengmeng1998/article/details/113839959