RecSys Challenge 2022 小试牛刀_0.19+

RecSys Challenge 2022 基于session的推荐挑战赛

今年的挑战主要集中在时尚推荐上:给定用户会话、购买数据和有关商品的内容数据,能否准确预测会话结束时将被购买的时尚单品?

重要的是能够根据用户在当前会话期间所做的事情做出推荐,以创造可能导致购买的最佳体验。时尚领域的细微差别使得准确的会话预测比其他领域更重要:

平均 51% 的访客是新访客(Dressipi 数据),这意味着没有可用的历史数据,我们只能依赖当前的会话活动。
比赛链接

1. 导入相关库

import os
import pandas as pd
import time
from datetime import datetime
from tqdm import tqdm

from collections import defaultdict
import math,pickle
import numpy as np

2. 读取数据

# Load the raw challenge CSV files from <data_dir>/dataset/.
# NOTE(review): `data_dir` is not defined in the visible code; it has to be
# assigned before this cell runs.
train_sessions = pd.read_csv(os.path.join(data_dir, 'dataset/train_sessions.csv'))
train_purchases = pd.read_csv(os.path.join(data_dir, 'dataset/train_purchases.csv'))
test_sessions = pd.read_csv(os.path.join(data_dir, 'dataset/test_leaderboard_sessions.csv'))
test_final = pd.read_csv(os.path.join(data_dir, 'dataset/test_final_sessions.csv'))

# Item ids listed in candidate_items.csv, kept as a set for O(1) membership
# tests (item_based_recommend only recommends items found in this set).
candidates_items = pd.read_csv(os.path.join(data_dir, 'dataset/candidate_items.csv'))
candidates_set = set(candidates_items.item_id)

2.1 处理时间

def strip(t):
    """Return the part of ``t`` that precedes its first '.'.

    If ``t`` contains no '.', it is returned unchanged. The script uses this
    to drop the fractional-seconds suffix from timestamp strings.
    """
    head, _, _ = t.partition('.')
    return head

def _to_epoch_seconds(raw):
    """Convert a 'YYYY-mm-dd HH:MM:SS[.ffffff]' string to epoch seconds (float).

    The fractional part is discarded by ``strip``; ``time.mktime`` interprets
    the parsed naive timestamp in the machine's local timezone.
    """
    parsed = datetime.strptime(strip(raw), '%Y-%m-%d %H:%M:%S')
    return time.mktime(parsed.timetuple())


# Normalise the 'date' column of every frame to numeric epoch seconds so that
# click times can be subtracted from each other later on.
for _frame in (train_sessions, train_purchases, test_sessions, test_final):
    _frame['date'] = _frame['date'].apply(_to_epoch_seconds)

# Stack views, purchases and both test sets into one interaction frame.
# pd.concat replaces the chained DataFrame.append calls (append was removed in
# pandas 2.0); the original row indices are kept, exactly as append did.
train_sessions = pd.concat([train_sessions, train_purchases, test_sessions, test_final])

# Popularity table: one row per item with its interaction count.
# rename_axis/reset_index(name=...) pins the column names to 'index' (item id)
# and 'item_id' (count) regardless of the pandas version, because the default
# names produced by value_counts().reset_index() changed in pandas 2.0.
order = (
    train_sessions['item_id']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='item_id')
)
order = order.sort_values('item_id', ascending=False)
# Item ids from most to least interacted-with; used to pad short recall lists.
popular_items = list(order['index'])

3. 计算物品相似度矩阵

def get_sim_item(df_, user_col, item_col, use_iif=False):
    """Build an item-to-item co-occurrence similarity table.

    Every ordered pair of distinct items that appears in the same group
    (``user_col``) adds a weight to ``sim[item][related_item]``. The summed
    weights are then divided by ``(count[item] * count[related_item]) ** 0.2``
    to damp very frequent items.

    :param df_: interaction frame holding ``user_col``, ``item_col`` and a
        numeric ``'date'`` column; rows are used in the order they appear.
    :param user_col: name of the grouping column (session/user id).
    :param item_col: name of the item id column.
    :param use_iif: when True each pair contributes ``1 / log(1 + group size)``;
        when False (default) the contribution is additionally weighted by the
        positional distance, the direction and the click-time gap of the pair.
    :return: nested dict ``{item: {related_item: similarity}}``.

    When ``use_iif`` is False the module-level hyper-parameters
    ``Reverse_order_weight_alpha``, ``loc_weight_alpha``,
    ``click_time_weight_alpha`` and ``time_xiuzheng`` are read; they are not
    defined inside this function and must exist before it is called.
    """
    # groupby/agg does not mutate its input, so no defensive copy is made.
    grouped = df_.groupby(user_col)
    user_item_ = grouped[item_col].agg(list).reset_index()
    user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col]))

    # Click times, aligned index-for-index with the item lists above.
    user_time_ = grouped['date'].agg(list).reset_index()
    user_time_dict = dict(zip(user_time_[user_col], user_time_['date']))

    sim_item = {}
    item_cnt = defaultdict(int)  # number of occurrences of each item
    for user, items in tqdm(user_item_dict.items()):
        times = user_time_dict[user]
        # The group-length damping is identical for every pair of this group.
        session_norm = math.log(len(items) + 1)
        iif_weight = 1 / session_norm
        for loc1, item in enumerate(items):
            item_cnt[item] += 1
            related = sim_item.setdefault(item, {})
            t1 = times[loc1]
            for loc2, relate_item in enumerate(items):
                if item == relate_item:
                    continue
                if use_iif:
                    weight = iif_weight
                else:
                    # Pairs where the related item comes later count fully;
                    # reverse-order pairs are scaled by a separate factor.
                    loc_alpha = 1.0 if loc2 > loc1 else Reverse_order_weight_alpha
                    # Positional decay: adjacent items get exponent 0.
                    loc_weight = loc_alpha * (loc_weight_alpha ** (np.abs(loc2 - loc1) - 1))
                    # Click-time weight derived from the scaled time gap.
                    click_time_weight = np.exp(
                        click_time_weight_alpha ** np.abs((t1 - times[loc2]) / time_xiuzheng)
                    )
                    weight = loc_weight * click_time_weight / session_norm
                related[relate_item] = related.get(relate_item, 0) + weight

    # Normalise in place by the occurrence counts of both items. Only the
    # values of existing keys are replaced, which is safe while iterating.
    for i, related_items in tqdm(sim_item.items()):
        for j, cij in related_items.items():
            related_items[j] = cij / ((item_cnt[i] * item_cnt[j]) ** 0.2)

    return sim_item

# Compute the item similarity table over the combined interaction frame,
# grouping by session and using the position/time-weighted variant.
sim_item_corr = get_sim_item(train_sessions, 'session_id','item_id',use_iif=False)

4. 召回

# Item-based recall
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click,
                         candidate_items=None, loc_weight_alpha=None):
    """
        Recall items for one session with item-based collaborative filtering.

        :param user_id: key of the session to recommend for
        :param user_item_time_dict: dict, click sequence per session
            {session1: [(item1, time1), (item2, time2)..]...}
        :param i2i_sim: dict, item similarity table {item: {related_item: score}}
        :param sim_item_topk: int, number of most similar items taken per history item
        :param recall_item_num: int, number of items to return
        :param item_topk_click: list of item ids, most popular first, used to pad
            the result when fewer than recall_item_num items were recalled
        :param candidate_items: optional container of recommendable item ids;
            defaults to the module-level ``candidates_set``
        :param loc_weight_alpha: optional positional decay base; defaults to the
            module-level ``infer_loc_weight_alpha``

        :return: (item_list, score_list). item_list holds the recalled item ids in
            rank order followed by the popularity padding; score_list holds the
            scores of the recalled (non-padded) items only. Fewer than
            recall_item_num items are returned when item_topk_click runs out.
    """
    if candidate_items is None:
        candidate_items = candidates_set
    if loc_weight_alpha is None:
        loc_weight_alpha = infer_loc_weight_alpha

    # Items the session already interacted with are never recommended back.
    user_hist_items = user_item_time_dict[user_id]
    hist_item_ids = {item for item, _ in user_hist_items}
    hist_len = len(user_hist_items)

    item_rank = {}
    for loc, (i, _click_time) in enumerate(user_hist_items):
        # Later positions in the history get a smaller exponent. Click times
        # are carried in the history tuples but do not enter the score.
        loc_weight = loc_weight_alpha ** (hist_len - loc)
        top_similar = sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]
        for j, wij in top_similar:
            if j in hist_item_ids or j not in candidate_items:
                continue
            item_rank[j] = item_rank.get(j, 0) + loc_weight * wij

    ranked = sorted(item_rank.items(), key=lambda d: d[1], reverse=True)[:recall_item_num]
    if ranked:
        ranked = np.array(ranked)
        item_list = list(ranked[:, 0].astype('int32'))
        score_list = ranked[:, 1]
    else:
        item_list = []
        score_list = []

    # Pad with popular candidate items. Appending while tracking a `seen` set
    # keeps the recalled ranking intact (rebuilding the list from a set would
    # scramble it), and iterating the popularity list directly stops cleanly
    # when it is exhausted.
    if len(item_list) < recall_item_num:
        seen = set(item_list)
        for popular_item in item_topk_click:
            if len(item_list) >= recall_item_num:
                break
            if popular_item in candidate_items and popular_item not in seen:
                item_list.append(popular_item)
                seen.add(popular_item)

    return item_list,score_list
import collections
# Build each session's click sequence ordered by click time:
# {session1: [(item1, time1), (item2, time2)..]...}
def get_user_item_time(click_df):
    """Return ``{session_id: [(item_id, date), ...]}`` ordered by ``date``.

    :param click_df: frame with ``'session_id'``, ``'item_id'`` and ``'date'``
        columns; it is not modified.
    :return: plain dict mapping every session id to its list of
        ``(item_id, date)`` pairs in ascending ``date`` order. Rows with equal
        dates keep their original relative order.
    """
    # mergesort is stable, so equal timestamps keep the input row order.
    ordered = click_df.sort_values('date', kind='mergesort')
    user_item_time_dict = {}
    for session_id, item_id, click_time in zip(ordered['session_id'], ordered['item_id'], ordered['date']):
        user_item_time_dict.setdefault(session_id, []).append((item_id, click_time))
    return user_item_time_dict
    
# Click sequences of the leaderboard test sessions, keyed by session id.
user_item_time_dict = get_user_item_time(test_sessions)
#user_item_time_dict = get_user_item_time(test_final)   # swap this line in to predict the final test set

# Recall settings: similar items taken per history item, size of the
# recommendation list, and the popularity list used for padding.
sim_item_topk = 1500
recall_item_num = 100
item_topk_click = popular_items

# Quick smoke test on a single session (key 26 must exist in user_item_time_dict)
item_list,score_list=item_based_recommend(26, user_item_time_dict, sim_item_corr, sim_item_topk,recall_item_num,item_topk_click)
len(item_list)
# Viewed items per test session; only the session ids are used by the loop below.
test_session_dict = test_sessions.groupby('session_id')['item_id'].agg(list).to_dict()  
#test_session_dict = test_final.groupby('session_id')['item_id'].agg(list).to_dict()  
# Column buffers for the result table, filled session by session.
session_id_list = []
item_id_list = []
rank_list = []
for session_id,session_item_list in tqdm(test_session_dict.items()):
    item_list,score_list = item_based_recommend(session_id, user_item_time_dict, sim_item_corr, sim_item_topk,recall_item_num,item_topk_click)
    session_id_list += [session_id for _ in range(len(item_list))]
    item_id_list += list(item_list)
    # rank is the 1-based position of each item within item_list
    rank_list += [x for x in range(1,len(item_list)+1)]
# Assemble the result table with one (session_id, item_id, rank) row per recommendation.
res_df = pd.DataFrame()
res_df['session_id'] = session_id_list
res_df['item_id'] = item_id_list
res_df['rank'] = rank_list
res_df

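选用合适的参数后,Leaderboard 分数可以达到 0.19+,小小的 CF 魔力巨大。注意:文中代码用到的 data_dir、time_xiuzheng 以及各个以 _alpha 结尾的超参数(如 loc_weight_alpha)均未给出取值,运行前需自行定义。后续其他方案以及参数会整理到 GitHub 上。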


版权声明:本文为weixin_44127327原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。