Copyright D. Poole 2017. You may use it under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. See: http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en
To use this you need to download http://files.grouplens.org/datasets/movielens/ml-100k.zip See http://grouplens.org/datasets/movielens/ Run this notebook (using "jupyter notebook" command) in the directory that contains the ml-100k directory.
The following reads the ratings file and selects the temporally first 60000 ratings. It trains on the users who were involved in the first 40000 ratings. It tests on the other users who rated.
with open("ml-100k/u.data",'r') as ratingsfile:
    # every line holds four tab-separated integers, unpacked below as
    # (user, item, rating, date)
    all_ratings = (tuple(map(int, line.strip().split('\t')))
                   for line in ratingsfile)
    # all_ratings is lazy, so it has to be consumed while the file is open;
    # only ratings dated no later than the cut-off are kept
    ratings = [eg for eg in all_ratings if eg[3] <= 884673930]

# every user who has at least one of the selected ratings
all_users = {user for (user, _item, _rating, _date) in ratings}
print("There are ",len(ratings),"ratings and",len(all_users),"users")

# training users have a rating dated no later than the (earlier) training
# cut-off; every other user is a test user
training_users = {user for (user, _item, _rating, date) in ratings
                  if date <= 880845177}
test_users = all_users.difference(training_users)
# build the user -> gender dictionaries for the training and the test users
with open("ml-100k/u.user",'r') as usersfile:
    # every line holds five '|'-separated fields; only the first (the user)
    # and the third (the gender) are used here
    user_info = (line.strip().split('|') for line in usersfile)
    gender_train, gender_test = {},{}
    for (u,a,g,o,p) in user_info:
        uid = int(u)
        if uid in training_users:
            gender_train[uid] = g
        elif uid in test_users:
            gender_test[uid] = g

# sanity check: every training/test user was found in the user file
assert len(training_users)==len(gender_train)
assert len(test_users)==len(gender_test)
print("There are ",len(gender_train),"users for training")
print("There are ",len(gender_test),"users for testing")
# tot_tr = total number of training users
tot_tr = len(gender_train)
# nf_tr = number of training users whose gender is 'F'
nf_tr = sum(1 for gender in gender_train.values() if gender=='F')
print("Proportion of training who are female",nf_tr,'/',tot_tr,'=',nf_tr/tot_tr)
The following function can be used to evaluate your predictor on the test set. Your predictor may use ratings and gender_train but not gender_test. Your predictor should take a user and a second, keyword parameter called "para": a tunable value (for example, a pseudo-count) that can be varied.
import math
def _log2(p):
    """log base 2 of p, with log2(0) = -infinity instead of a ValueError"""
    if p == 0:
        return -math.inf
    return math.log(p, 2)


def evaluate(pred, para=1, test_set=None):
    """Evaluate a gender predictor on held-out users.

    pred is a function pred(u, para=...) from users into real numbers that
    gives the predicted probability that user u is female.
    para is handed unchanged to every call of pred.
    test_set is a user -> gender dictionary ('F' means female, any other
    value counts as not female); when it is None (the default) the global
    gender_test is used.

    returns (sum_squares_error, log loss), with the log loss in bits.
    The log loss is infinite when pred gives probability 0 to the gender a
    user actually has.
    """
    if test_set is None:
        test_set = gender_test
    # call pred once per user and reuse the value for both error measures
    scored = [(pred(u, para=para), g == "F") for (u, g) in test_set.items()]
    sse = sum((pr - (1 if is_female else 0))**2 for (pr, is_female) in scored)
    # score the probability that was given to the user's actual gender
    ll = -sum(_log2(pr if is_female else 1 - pr) for (pr, is_female) in scored)
    return (sse, ll)
def pred_ave(u, para=1):
    """predict the proportion of females among the training users, smoothed
    by adding para pseudo-examples of each gender; the user u is ignored"""
    smoothed_females = nf_tr + para
    smoothed_total = tot_tr + 2*para
    return smoothed_females/smoothed_total
# Report (sum-squares-error, log loss) on the test users for pred_ave with the default para=1.
print("Errors for predicting average from training set",evaluate(pred_ave))
def pred_05(u, para=1):
    """baseline predictor: always answer one half, whatever u and para are"""
    return 1/2
# Report (sum-squares-error, log loss) on the test users for the constant 0.5 predictor.
print("Errors for predicting 0.5",evaluate(pred_05))
# movie_stats[m] is the pair (#f, #m): how many female and how many male
# training users have a rating for movie m
movie_stats = {}
for (u,i,r,d) in ratings:
    if u not in gender_train:
        continue  # only training users may contribute gender counts
    (nf, nm) = movie_stats.get(i, (0,0))
    gender = gender_train[u]
    if gender=="F":
        movie_stats[i] = (nf+1,nm)
    elif gender=="M":
        movie_stats[i] = (nf,nm+1)
# user_stats[u] is the list of movies that user u rated (training and test
# users alike), in the order the ratings appear
user_stats = {}
for (u,i,r,d) in ratings:
    user_stats.setdefault(u, []).append(i)
def pred1(u,para=1): # para is the prior count
    """returns the proportion of females among the training users' ratings
    of the movies that u rated, that is sum_m nf/(sum_m nf + nm)
    para is a prior count, split between the genders in the training
    proportions, to which the movie counts are added
    """
    fc = para*nf_tr/tot_tr # female count, starting from the prior
    mc = para*(1 - nf_tr/tot_tr) # male count, starting from the prior
    # movies that no training user rated have no counts and are skipped
    counts = (movie_stats[m] for m in user_stats[u] if m in movie_stats)
    for (nf, nm) in counts:
        fc, mc = fc + nf, mc + nm
    return fc/(fc+mc)
def pred2(u, para=1): # para is the pseudo-count
    """returns the average of the proportion of females of the movies that u rated

    Each movie's proportion is smoothed with para pseudo-examples of each
    gender: (nf+para)/(nf+nm+2*para).
    Movies that no training user rated are ignored. If that leaves no movie
    at all, the proportion of females among the training users is returned
    instead of dividing by zero.
    """
    tot, cnt = (0,0)
    for m in user_stats[u]:
        if m in movie_stats:
            (nf,nm) = movie_stats[m]
            tot += (nf+para)/(nf+nm+2*para)
            cnt += 1
    if cnt == 0:
        # nothing is known about u's movies: fall back to the base rate
        return nf_tr/tot_tr
    return tot/cnt
# Report (sum-squares-error, log loss) on the test users for pred1 and pred2 with the default para=1.
print("Errors for pred 1",evaluate(pred1))
print("Errors for pred 2",evaluate(pred2))
%matplotlib inline
import matplotlib.pyplot as plt
def plotpc(which=1): #which=0 is sse, which=1 is logloss
    """Plot the test error of pred1, pred2 and pred_ave against the pseudocount.

    which selects the component of evaluate's result to plot: 0 is the
    sum-squares-error, 1 (the default) is the log loss.
    The pseudocounts tried are 0.0, 0.1, ..., 9.9. The curves are drawn with
    matplotlib.pyplot and "Done!" is printed when they are finished.
    """
    #plt.ion() # make it interactive
    plt.xlabel("pseudocount")
    plt.ylabel(("Log Loss" if which else "sum-squares-error")+" (lower is better)")
    plt.xscale('linear') # Makes a 'log' or 'linear' scale
    xvalues = [x/10 for x in range(100)]
    # (predictor, legend label), in the order the curves are drawn
    curves = [(pred1, "pred 1"), (pred2, "pred 2"), (pred_ave, "trivial")]
    for (pred, label) in curves:
        errors = [evaluate(pred, para=x)[which] for x in xvalues]
        plt.plot(xvalues, errors, label=label)
    plt.legend(loc="upper right") # display the legend
    print("Done!")
# Plot the log loss (the default, which=1), then the sum-squares-error, against the pseudocount.
plotpc()
plotpc(0) ## sum-squares-error
movie_rating_stats[m][r] gives the pair (#females, #males) counting the training users who gave movie m a rating of r.
# movie_rating_stats[m][r] is the pair (#f, #m): how many female and how
# many male training users gave movie m the rating r
movie_rating_stats = {}
for (u,i,r,d) in ratings:
    if u not in gender_train:
        continue  # only training users may contribute gender counts
    if i not in movie_rating_stats:
        # one (#f, #m) pair for each rating index 0..5
        # NOTE(review): ratings are presumably 1..5, leaving index 0 unused - confirm
        movie_rating_stats[i] = [(0,0)]*6
    per_rating = movie_rating_stats[i]
    (nf, nm) = per_rating[r]
    gender = gender_train[u]
    if gender=="F":
        per_rating[r] = (nf+1,nm)
    elif gender=="M":
        per_rating[r] = (nf,nm+1)
user_to_movie_and_rating[u] gives the list of (m,r) pairs such that u rated movie m with a rating of r
# user_to_movie_and_rating[u] is the list of (movie, rating) pairs of user u
# (training and test users alike), in the order the ratings appear
user_to_movie_and_rating = {}
for (u,i,r,d) in ratings:
    user_to_movie_and_rating.setdefault(u, []).append((i,r))
def pred3(u,para=1):
    """returns sum nf/(sum nf + nm), where the sums run over the movies u
    rated and nf, nm are the numbers of female and male training users who
    gave that movie the same rating as u did
    para is a prior count, split between the genders in the training
    proportions, to which those counts are added"""
    fc = para*nf_tr/tot_tr # female count, starting from the prior
    mc = para*(1-nf_tr/tot_tr) # male count, starting from the prior
    # movies that no training user rated have no counts and are skipped
    counts = (movie_rating_stats[m][r]
              for (m,r) in user_to_movie_and_rating[u]
              if m in movie_rating_stats)
    for (nf, nm) in counts:
        fc, mc = fc + nf, mc + nm
    return fc/(fc+mc)
def pred4(u, para=1):
    """returns the average, over the movies u rated, of the smoothed
    proportion of females among the training users who gave that movie the
    same rating as u did
    the average starts from para pseudo-movies whose proportion is the
    proportion of females among the training users"""
    tot = para*nf_tr/tot_tr # running sum of proportions, seeded by the prior
    cnt = para # running number of proportions, seeded by the prior
    for (m,r) in user_to_movie_and_rating[u]:
        if m not in movie_rating_stats:
            continue # no training user rated m, so there is nothing to add
        (nf,nm) = movie_rating_stats[m][r]
        tot = tot + (nf+para)/(nf+nm+2*para)
        cnt = cnt + 1
    return tot/cnt
# Report (sum-squares-error, log loss) on the test users for pred3 and pred4 with the default para=1.
print("Errors for pred 3",evaluate(pred3))
print("Errors for pred 4",evaluate(pred4))
def plotpc2(which=1): #which=0 is sse, which=1 is logloss
    """Plot the test error of pred4, pred2 and pred3 against the pseudocount.

    which selects the component of evaluate's result to plot: 0 is the
    sum-squares-error, 1 (the default) is the log loss. The y axis is
    labelled to match.
    The pseudocounts tried are 0.1, 0.2, ..., 9.9. The curves are drawn with
    matplotlib.pyplot and "Done!" is printed when they are finished.
    """
    #plt.ion() # make it interactive
    plt.xlabel("pseudocount")
    # label the axis with the measure that is actually plotted
    plt.ylabel(("Log Loss" if which else "sum-squares-error")+" (lower is better)")
    plt.xscale('linear') # Makes a 'log' or 'linear' scale
    xvalues = [x/10 for x in range(1,100)]
    # (predictor, legend label), in the order the curves are drawn
    curves = [(pred4, "pred 4"), (pred2, "pred 2"), (pred3, "pred 3")]
    for (pred, label) in curves:
        errors = [evaluate(pred, para=x)[which] for x in xvalues]
        plt.plot(xvalues, errors, label=label)
    plt.legend(loc="upper right") # display the legend
    print("Done!")
# Compare pred4, pred2 and pred3: first on log loss (the default, which=1), then on sum-squares-error.
plotpc2()
plotpc2(0) # sum-squares-error