# 7_rest.py (forked from proto-n/recsys-challenge-2018)
#
# Playlist-neighbourhood (kNN) recommender for several RecSys Challenge 2018 sub-tasks:
# each challenge playlist's seed tracks are matched against the training playlists, the
# knn_k most similar playlists are kept, and their tracks are aggregated into a ranked
# list of 500 recommendations per playlist.

import pandas as pd
import numpy as np
import scipy.sparse as spl
from concurrent.futures import ProcessPoolExecutor
import sys

threads = 4

# One parameter set per challenge sub-task, selected by the integer command-line
# argument: [split, knn_k, test_task, powb], where
#   split     - number of seed tracks in the sub-task (500 + split candidates are ranked
#               so the seed tracks themselves can be filtered out afterwards)
#   knn_k     - number of most similar training playlists kept as the neighbourhood
#   test_task - task labels used to select the matching challenge playlists
#   powb      - exponent of the popularity discount applied to the seed tracks
all_tasks = [
    [5, 8000, ['5t', '5nt'], 0.352],
    [10, 12000, ['10t', '10nt'], 0.38],
    [25, 40000, ['25f'], 0.43386578246281293],
    [25, 9000, ['25r'], 0.4],
    [100, 4000, ['100r'], 0.39],
]
split, knn_k, test_task, powb = all_tasks[int(sys.argv[1])]
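
# Illustrative invocation (a sketch; it assumes the data/ and output/ directories used
# throughout the repository are already in place):
#   python 7_rest.py 0
# selects all_tasks[0], i.e. split=5, knn_k=8000, test_task=['5t', '5nt'], powb=0.352,
# and the recommendations are written to output/5t-5nt.csv (see the end of the script).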

def recode(column, min_val=0):
    # Map the unique values of a column to consecutive integer codes starting at min_val;
    # returns the recoded column together with the value -> code dictionary.
    uniques = column.unique()
    codes = range(min_val, len(uniques) + min_val)
    code_map = dict(zip(uniques, codes))
    return (column.map(code_map), code_map)


def reverse_code(column, code_map):
    # Invert a mapping produced by recode, turning integer codes back into the originals.
    inv_map = {v: k for k, v in code_map.items()}
    return column.map(inv_map)
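
# A minimal round-trip sketch of the two helpers on toy data (illustrative names only,
# not used by the pipeline):
#   _toy = pd.Series(['a', 'b', 'a'])
#   _coded, _cmap = recode(_toy)         # codes [0, 1, 0], map {'a': 0, 'b': 1}
#   _back = reverse_code(_coded, _cmap)  # ['a', 'b', 'a'] again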

# Playlist metadata of the Million Playlist Dataset and of the challenge set are stacked
# so that playlists from both sources share one code space; song metadata is the
# de-duplicated track list.
playlist_meta = pd.read_csv('data/million_playlist_dataset/playlist_meta.csv')
playlist_meta_c = pd.read_csv('data/challenge_set/playlist_meta.csv')
playlist_meta = pd.concat([playlist_meta, playlist_meta_c], axis=0, ignore_index=True)
song_meta = pd.read_csv('data/million_playlist_dataset/song_meta_no_duplicates.csv')

# Dense integer codes for playlists and songs; the maps are kept so the original ids can
# be restored in the output.
playlist_meta['pid_code'], pid_codes = recode(playlist_meta['pid'])
song_meta['song_code'], song_codes = recode(song_meta['song_id'])

# Playlist-track interactions: train is the full Million Playlist Dataset, test is
# restricted to the challenge playlists belonging to the selected sub-task labels.
train = pd.read_csv('data/million_playlist_dataset/playlists.csv')
test = pd.read_csv('data/challenge_set/playlists.csv')
test_tasks = pd.read_csv('data/challenge_set/playlist_meta_tasks.csv')
test_tasks_pids = test_tasks[test_tasks.task.isin(test_task)].pid.unique()
test = test[test.pid.isin(test_tasks_pids)].copy()

train['pid_code'] = train['pid'].map(pid_codes)
train['song_code'] = train['song_id'].map(song_codes)
train.sort_values(['pid_code', 'song_code'], inplace=True)
test['pid_code'] = test['pid'].map(pid_codes)
test['song_code'] = test['song_id'].map(song_codes)

# One binary interaction per (playlist, track) pair; val_stoch divides each playlist's
# entries by the L2 norm of its row, so every playlist row has unit norm.
train_agg = train.drop_duplicates(subset=['pid_code', 'song_code']).copy()
test_agg = test.drop_duplicates(subset=['pid_code', 'song_code']).copy()
train_agg['val'] = 1
test_agg['val'] = 1
train_agg['val_stoch'] = train_agg.groupby('pid_code').val.transform(lambda x: x / np.linalg.norm(x))
test_agg['val_stoch'] = test_agg.groupby('pid_code').val.transform(lambda x: x / np.linalg.norm(x))
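
# Since val is 1 everywhere, the normalisation above assigns each of a playlist's n
# tracks the weight 1 / sqrt(n). A toy check of that property (illustrative names only):
#   _toy = pd.DataFrame({'pid_code': [0, 0, 0, 1], 'val': 1.0})
#   _toy.groupby('pid_code').val.transform(lambda x: x / np.linalg.norm(x))
#   # -> 0.577..., 0.577..., 0.577..., 1.0  (1/sqrt(3) for the three-track playlist)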

# Training popularity of each seed track; tracks never seen in train fall back to 1.
test_agg_pop = test_agg.join(train.song_code.value_counts().rename('pop'), on='song_code')
test_agg_pop['pop'].fillna(1, inplace=True)

# Sparse playlist-song matrices over the full code space (every playlist and song gets a
# row/column, even if it never occurs in train). sp_A holds the row-normalised weights
# and is used to measure playlist similarity; sp_A_const holds plain 0/1 memberships and
# is used to aggregate the tracks of the neighbouring playlists.
n_playlists = int(playlist_meta.pid_code.max() + 1)
n_songs = int(song_meta.song_code.max() + 1)
sp_A = spl.coo_matrix((train_agg['val_stoch'].values, train_agg[['pid_code', 'song_code']].values.T),
                      shape=(n_playlists, n_songs)).tocsr()
sp_A_t = sp_A.T
sp_A_const = spl.coo_matrix((train_agg['val'].values, train_agg[['pid_code', 'song_code']].values.T),
                            shape=(n_playlists, n_songs)).tocsr()
sp_A_const_t = sp_A_const.T

# Additive offset in the neighbour reweighting inside recs_for_ids; disabled (0) here.
plusadd = 0
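
# Per playlist, recs_for_ids below performs (written as a rough sketch; q is the
# popularity-discounted seed vector of one challenge playlist):
#   sims   = sp_A @ q                      # similarity of every training playlist to the seeds
#   top    = indices of the knn_k largest entries of sims
#   w      = ((sims[top] - sims[top].min()) / sims[top].max() + plusadd) ** 2
#   scores = sp_A_const.T[:, top] @ w      # per-track sum of the weights of playlists containing it
# and keeps the 500 best-scoring tracks that are not already among the seeds.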

def recs_for_ids(ids_):
    # Build 500 recommendations for each test playlist code in ids_.
    dfs = []
    ndcgs = []  # returned for interface compatibility; not filled in this script
    for pid_ in ids_:
        # Seed tracks of this playlist as a dense column vector, down-weighting popular
        # tracks: weight = val_stoch / ((pop - 1)**powb + 1).
        p1 = test_agg_pop[(test_agg_pop.pid_code == pid_)]
        np_p1 = np.zeros([n_songs, 1])
        np_p1[p1.song_code.values] = p1[['val_stoch']].values / ((p1[['pop']].values - 1)**(powb) + 1)

        # Similarity of every training playlist to the seed vector.
        simpls = sp_A.dot(np_p1)

        # Keep only the knn_k most similar playlists and rescale their similarities
        # (relative to the largest one) before squaring them.
        simpls2 = np.zeros_like(simpls)
        inds = simpls.reshape(-1).argsort()[-knn_k:][::-1]
        vals = simpls[inds]
        m = np.max(vals)
        if m == 0:
            m += 0.01
        vals2 = ((vals - np.min(vals)) * (1 / m) + plusadd)**2
        simpls2[inds] = vals2

        # Aggregate the tracks of the neighbouring playlists, drop the seed tracks and
        # keep the 500 best-scoring remaining tracks.
        tmp = sp_A_const_t[:, inds].dot(vals2)
        indices_np = tmp.reshape(-1).argsort()[-(500 + split):][::-1]
        indices_np = indices_np[~np.isin(indices_np, p1.song_code)][:500]
        dfs.append(pd.DataFrame({
            'pid': np.repeat(pid_, 500),
            'pos': np.arange(500),
            'song_id': indices_np,
            'score': tmp[indices_np, 0]
        }))

    recdf = pd.concat(dfs, axis=0)
    recdf['pid'] = reverse_code(recdf['pid'], pid_codes)
    recdf['song_id'] = reverse_code(recdf['song_id'], song_codes)
    return (recdf, ndcgs)
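
# A worked micro-example of the neighbour reweighting above (toy numbers, plusadd = 0):
# with kept similarities vals = [0.9, 0.5, 0.1], np.min(vals) = 0.1 and m = 0.9, the
# weights become ((0.9 - 0.1) / 0.9)**2 ~ 0.79, ((0.5 - 0.1) / 0.9)**2 ~ 0.20 and 0.0,
# so the weakest kept neighbour contributes nothing while the strongest dominates the
# aggregated track scores.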

# Split the challenge playlists across worker processes, run recs_for_ids on each chunk
# and concatenate the per-chunk recommendation frames into one CSV named after the
# sub-task labels.
pool = ProcessPoolExecutor(threads)
res = list(pool.map(recs_for_ids, np.array_split(test_agg.pid_code.unique(), threads)))
pool.shutdown()
recdf = pd.concat([r[0] for r in res], axis=0)
recdf.to_csv('output/%s.csv' % "-".join(test_task), index=False)
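
# The resulting CSV has one row per recommended track: pid (original playlist id), pos
# (rank 0-499), song_id (original track id) and score, i.e. 500 rows per challenge
# playlist of the selected sub-task. A sketch of turning it into per-playlist track
# lists (hypothetical post-processing, not part of the original pipeline):
#   _recs = pd.read_csv('output/%s.csv' % "-".join(test_task))
#   _lists = _recs.sort_values(['pid', 'pos']).groupby('pid').song_id.apply(list)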