推荐系统原始数据

This commit is contained in:
jiangzhonglian
2017-05-16 14:25:05 +08:00
parent b9eac31808
commit a3eb02bb1d
11 changed files with 1010913 additions and 0 deletions

View File

@@ -0,0 +1,72 @@
def SplitData(data, M, k, seed):
test = []
train = []
random.seed(seed)
for user, item in data:
if random.randint(0,M) == k:
test.append([user,item])
else:
train.append([user,item])
return train, test
# 准确率
def Precision(train, test, N):
hit = 0
all = 0
for user in train.keys():
tu = test[user]
rank = GetRecommendation(user, N)
for item, pui in rank:
if item in tu:
hit += 1
all += N
return hit / (all * 1.0)
# 召回率
def Recall(train, test, N):
hit = 0
all = 0
for user in train.keys():
tu = test[user]
rank = GetRecommendation(user, N)
for item, pui in rank:
if item in tu:
hit += 1
all += len(tu)
return hit / (all * 1.0)
# 覆盖率
def Coverage(train, test, N):
recommend_items = set()
all_items = set()
for user in train.keys():
for item in train[user].keys():
all_items.add(item)
rank = GetRecommendation(user, N)
for item, pui in rank:
recommend_items.add(item)
return len(recommend_items) / (len(all_items) * 1.0)
# 新颖度
def Popularity(train, test, N):
item_popularity = dict()
for user, items in train.items():
for item in items.keys():
if item not in item_popularity:
item_popularity[item] = 0
item_popularity[item] += 1
ret = 0
n=0
for user in train.keys():
rank = GetRecommendation(user, N)
for item, pui in rank:
ret += math.log(1 + item_popularity[item])
n += 1
ret /= n * 1.0
return ret

View File

@@ -0,0 +1,17 @@
def PersonalRank(G, alpha, root):
rank = dict()
rank = {x:0 for x in G.keys()}
rank[root] = 1
for k in range(20):
tmp = {x:0 for x in G.keys()}
for i, ri in G.items():
for j, wij in ri.items():
if j not in tmp:
tmp[j] = 0
tmp[j] += 0.6 * rank[i] / (1.0 * len(ri))
if j == root:
tmp[j] += 1 - alpha
rank = tmp
return rank

View File

@@ -0,0 +1,172 @@
#-*- coding: utf-8 -*-
'''
Created on 2015-06-22
@author: Lockvictor
'''
import sys, random, math
from operator import itemgetter
random.seed(0)
class ItemBasedCF():
''' TopN recommendation - ItemBasedCF '''
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_movie = 20
self.n_rec_movie = 10
self.movie_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print >> sys.stderr, 'Similar movie number = %d' % self.n_sim_movie
print >> sys.stderr, 'Recommended movie number = %d' % self.n_rec_movie
@staticmethod
def loadfile(filename):
''' load a file, return a generator. '''
fp = open(filename, 'r')
for i, line in enumerate(fp):
yield line.strip('\r\n')
if i % 100000 == 0:
print >> sys.stderr, 'loading %s(%s)' % (filename, i)
fp.close()
print >> sys.stderr, 'load %s succ' % filename
def generate_dataset(self, filename, pivot=0.7):
''' load rating data and split it to training set and test set '''
trainset_len = 0
testset_len = 0
for line in self.loadfile(filename):
user, movie, rating, _ = line.split('::')
# split the data by pivot
if (random.random() < pivot):
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
print >> sys.stderr, 'split training set and test set succ'
print >> sys.stderr, 'train set = %s' % trainset_len
print >> sys.stderr, 'test set = %s' % testset_len
def calc_movie_sim(self):
''' calculate movie similarity matrix '''
print >> sys.stderr, 'counting movies number and popularity...'
for user, movies in self.trainset.iteritems():
for movie in movies:
# count item popularity
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print >> sys.stderr, 'count movies number and popularity succ'
# save the total number of movies
self.movie_count = len(self.movie_popular)
print >> sys.stderr, 'total movie number = %d' % self.movie_count
# count co-rated users between items
itemsim_mat = self.movie_sim_mat
print >> sys.stderr, 'building co-rated users matrix...'
for user, movies in self.trainset.iteritems():
for m1 in movies:
for m2 in movies:
if m1 == m2: continue
itemsim_mat.setdefault(m1,{})
itemsim_mat[m1].setdefault(m2,0)
itemsim_mat[m1][m2] += 1
print >> sys.stderr, 'build co-rated users matrix succ'
# calculate similarity matrix
print >> sys.stderr, 'calculating movie similarity matrix...'
simfactor_count = 0
PRINT_STEP = 2000000
for m1, related_movies in itemsim_mat.iteritems():
for m2, count in related_movies.iteritems():
itemsim_mat[m1][m2] = count / math.sqrt(
self.movie_popular[m1] * self.movie_popular[m2])
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
print >> sys.stderr, 'calculating movie similarity factor(%d)' % simfactor_count
print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) succ'
print >> sys.stderr, 'Total similarity factor number = %d' %simfactor_count
def recommend(self, user):
''' Find K similar movies and recommend N movies. '''
K = self.n_sim_movie
N = self.n_rec_movie
rank = {}
watched_movies = self.trainset[user]
for movie, rating in watched_movies.iteritems():
for related_movie, w in sorted(self.movie_sim_mat[movie].items(),
key=itemgetter(1), reverse=True)[:K]:
if related_movie in watched_movies:
continue
rank.setdefault(related_movie, 0)
rank[related_movie] += w * rating
# return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]
def evaluate(self):
''' return precision, recall, coverage and popularity '''
print >> sys.stderr, 'Evaluation start...'
N = self.n_rec_movie
# varables for precision and recall
hit = 0
rec_count = 0
test_count = 0
# varables for coverage
all_rec_movies = set()
# varables for popularity
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print >> sys.stderr, 'recommended for %d users' % i
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
for movie, w in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print >> sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' \
% (precision, recall, coverage, popularity)
if __name__ == '__main__':
ratingfile = 'input/16.RecommendedSystem/ml-1m/ratings.dat'
itemcf = ItemBasedCF()
itemcf.generate_dataset(ratingfile)
itemcf.calc_movie_sim()
itemcf.evaluate()

View File

@@ -0,0 +1,41 @@
# 负样本采样过程
def RandomSelectNegativeSample(self, items):
ret = dict()
for i in items.keys():
ret[i] = 1
n = 0
for i in range(0, len(items) * 3):
item = items_pool[random.randint(0, len(items_pool) - 1)]
if item in ret:
continue
ret[item] = 0
n + = 1
if n > len(items):
break
return ret
def LatentFactorModel(user_items, F, N, alpha, lambda):
[P, Q] = InitModel(user_items, F)
for step in range(0,N):
for user, items in user_items.items():
samples = RandSelectNegativeSamples(items)
for item, rui in samples.items():
eui = rui - Predict(user, item)
for f in range(0, F):
P[user][f] += alpha * (eui * Q[item][f] - lambda * P[user][f])
Q[item][f] += alpha * (eui * P[user][f] - lambda * Q[item][f])
alpha *= 0.9
def Recommend(user, P, Q):
rank = dict()
for f, puf in P[user].items():
for i, qfi in Q[f].items():
if i not in rank:
rank[i] += puf * qfi
return rank

View File

@@ -0,0 +1,178 @@
#-*- coding: utf-8 -*-
'''
Created on 2015-06-22
@author: Lockvictor
'''
import sys, random, math
from operator import itemgetter
random.seed(0)
class UserBasedCF():
''' TopN recommendation - UserBasedCF '''
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_user = 20
self.n_rec_movie = 10
self.user_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print >> sys.stderr, 'Similar user number = %d' % self.n_sim_user
print >> sys.stderr, 'recommended movie number = %d' % self.n_rec_movie
@staticmethod
def loadfile(filename):
''' load a file, return a generator. '''
fp = open(filename, 'r')
for i,line in enumerate(fp):
yield line.strip('\r\n')
if i%100000 == 0:
print >> sys.stderr, 'loading %s(%s)' % (filename, i)
fp.close()
print >> sys.stderr, 'load %s succ' % filename
def generate_dataset(self, filename, pivot=0.7):
''' load rating data and split it to training set and test set '''
trainset_len = 0
testset_len = 0
for line in self.loadfile(filename):
user, movie, rating, timestamp = line.split('::')
# split the data by pivot
if (random.random() < pivot):
self.trainset.setdefault(user,{})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user,{})
self.testset[user][movie] = int(rating)
testset_len += 1
print >> sys.stderr, 'split training set and test set succ'
print >> sys.stderr, 'train set = %s' % trainset_len
print >> sys.stderr, 'test set = %s' % testset_len
def calc_user_sim(self):
''' calculate user similarity matrix '''
# build inverse table for item-users
# key=movieID, value=list of userIDs who have seen this movie
print >> sys.stderr, 'building movie-users inverse table...'
movie2users = dict()
for user,movies in self.trainset.iteritems():
for movie in movies:
# inverse table for item-users
if movie not in movie2users:
movie2users[movie] = set()
movie2users[movie].add(user)
# count item popularity at the same time
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print >> sys.stderr, 'build movie-users inverse table succ'
# save the total movie number, which will be used in evaluation
self.movie_count = len(movie2users)
print >> sys.stderr, 'total movie number = %d' % self.movie_count
# count co-rated items between users
usersim_mat = self.user_sim_mat
print >> sys.stderr, 'building user co-rated movies matrix...'
for movie,users in movie2users.iteritems():
for u in users:
for v in users:
if u == v: continue
usersim_mat.setdefault(u,{})
usersim_mat[u].setdefault(v,0)
usersim_mat[u][v] += 1
print >> sys.stderr, 'build user co-rated movies matrix succ'
# calculate similarity matrix
print >> sys.stderr, 'calculating user similarity matrix...'
simfactor_count = 0
PRINT_STEP = 2000000
for u,related_users in usersim_mat.iteritems():
for v,count in related_users.iteritems():
usersim_mat[u][v] = count / math.sqrt(
len(self.trainset[u]) * len(self.trainset[v]))
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
print >> sys.stderr, 'calculating user similarity factor(%d)' % simfactor_count
print >> sys.stderr, 'calculate user similarity matrix(similarity factor) succ'
print >> sys.stderr, 'Total similarity factor number = %d' %simfactor_count
def recommend(self, user):
''' Find K similar users and recommend N movies. '''
K = self.n_sim_user
N = self.n_rec_movie
rank = dict()
watched_movies = self.trainset[user]
# v=similar user, wuv=similarity factor
for v, wuv in sorted(self.user_sim_mat[user].items(),
key=itemgetter(1), reverse=True)[0:K]:
for movie in self.trainset[v]:
if movie in watched_movies:
continue
# predict the user's "interest" for each movie
rank.setdefault(movie,0)
rank[movie] += wuv
# return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
''' return precision, recall, coverage and popularity '''
print >> sys.stderr, 'Evaluation start...'
N = self.n_rec_movie
# varables for precision and recall
hit = 0
rec_count = 0
test_count = 0
# varables for coverage
all_rec_movies = set()
# varables for popularity
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print >> sys.stderr, 'recommended for %d users' % i
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
for movie, w in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0*rec_count)
recall = hit / (1.0*test_count)
coverage = len(all_rec_movies) / (1.0*self.movie_count)
popularity = popular_sum / (1.0*rec_count)
print >> sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' % \
(precision, recall, coverage, popularity)
if __name__ == '__main__':
ratingfile = 'input/16.RecommendedSystem/ml-1m/ratings.dat'
usercf = UserBasedCF()
usercf.generate_dataset(ratingfile)
usercf.calc_user_sim()
usercf.evaluate()

View File

@@ -0,0 +1,64 @@
def ItemSimilarity1(train):
#calculate co-rated users between items
C = dict()
N = dict()
for u, items in train.items():
for i in users:
N[i] += 1
for j in users:
if i == j:
continue
C[i][j] += 1
#calculate finial similarity matrix W
W = dict()
for i,related_items in C.items():
for j, cij in related_items.items():
W[u][v] = cij / math.sqrt(N[i] * N[j])
return W
def ItemSimilarity2(train):
#calculate co-rated users between items
C = dict()
N = dict()
for u, items in train.items():
for i in users:
N[i] += 1
for j in users:
if i == j:
continue
C[i][j] += 1 / math.log(1 + len(items) * 1.0)
#calculate finial similarity matrix W
W = dict()
for i,related_items in C.items():
for j, cij in related_items.items():
W[u][v] = cij / math.sqrt(N[i] * N[j])
return W
def Recommendation1(train, user_id, W, K):
rank = dict()
ru = train[user_id]
for i,pi in ru.items():
for j, wj in sorted(W[i].items(), key=itemgetter(1), reverse=True)[0:K]:
if j in ru:
continue
rank[j] += pi * wj
return rank
def Recommendation2(train, user_id, W, K):
rank = dict()
ru = train[user_id]
for i,pi in ru.items():
for j, wj in sorted(W[i].items(), key=itemgetter(1), reverse=True)[0:K]:
if j in ru:
continue
rank[j].weight += pi * wj
rank[j].reason[i] = pi * wj
return rank

View File

@@ -0,0 +1,78 @@
def UserSimilarity1(train):
W = dict()
for u in train.keys():
for v in train.keys():
if u == v:
continue
W[u][v] = len(train[u] & train[v])
W[u][v] /= math.sqrt(len(train[u]) * len(train[v]) * 1.0)
return W
def UserSimilarity2(train):
# build inverse table for item_users
item_users = dict()
for u, items in train.items():
for i in items.keys():
if i not in item_users:
item_users[i] = set()
item_users[i].add(u)
#calculate co-rated items between users
C = dict()
N = dict()
for i, users in item_users.items():
for u in users:
N[u] += 1
for v in users:
if u == v:
continue
C[u][v] += 1
#calculate finial similarity matrix W
W = dict()
for u, related_users in C.items():
for v, cuv in related_users.items():
W[u][v] = cuv / math.sqrt(N[u] * N[v])
return W
def UserSimilarity3(train):
# build inverse table for item_users
item_users = dict()
for u, items in train.items():
for i in items.keys():
if i not in item_users:
item_users[i] = set()
item_users[i].add(u)
#calculate co-rated items between users
C = dict()
N = dict()
for i, users in item_users.items():
for u in users:
N[u] += 1
for v in users:
if u == v:
continue
C[u][v] += 1 / math.log(1 + len(users))
#calculate finial similarity matrix W
W = dict()
for u, related_users in C.items():
for v, cuv in related_users.items():
W[u][v] = cuv / math.sqrt(N[u] * N[v])
return W
def Recommend(user, train, W):
rank = dict()
interacted_items = train[user]
for v, wuv in sorted(W[u].items, key=itemgetter(1), reverse=True)[0:K]:
for i, rvi in train[v].items:
if i in interacted_items:
#we should filter items user interacted before
continue
rank[i] += wuv * rvi
return rank