CPSC 532P 2017 - Predicting Gender from Ratings

Copyright D. Poole 2017. You may use it under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. See: http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en

Load the files.

To use this you need to download http://files.grouplens.org/datasets/movielens/ml-100k.zip (see http://grouplens.org/datasets/movielens/ for details). Run this notebook (using the "jupyter notebook" command) in the directory that contains the ml-100k directory.

The following reads the ratings file and selects the temporally first 60000 ratings. It trains on the users who were involved in the first 40000 of those ratings. It tests on the remaining users who appear in the selected ratings.

In [1]:
# Each line of u.data is parsed as four tab-separated integers, which the rest
# of the notebook unpacks as (u, i, r, d).
with open("ml-100k/u.data", 'r') as ratingsfile:
    # Lazy parser; it is fully consumed below while the file is still open.
    all_ratings = (tuple(map(int, line.strip().split('\t')))
                   for line in ratingsfile)
    # Keep only the ratings whose fourth field is at or before the cut-off.
    ratings = []
    for eg in all_ratings:
        if eg[3] <= 884673930:
            ratings.append(eg)

# Every user with a kept rating, and those with a rating at or before the
# (earlier) training cut-off.
all_users = set()
training_users = set()
for (u, i, r, d) in ratings:
    all_users.add(u)
    if d <= 880845177:
        training_users.add(u)
print("There are ",len(ratings),"ratings and",len(all_users),"users")
# Users seen only after the training cut-off are held out for testing.
test_users = all_users.difference(training_users)

# Build user -> gender dictionaries, one for training users and one for test users.
gender_train, gender_test = {}, {}
with open("ml-100k/u.user", 'r') as usersfile:
    # Each line is split into five '|'-separated fields; only the first
    # (user id) and third (gender label) are used here.
    user_info = (line.strip().split('|') for line in usersfile)
    for (u, _, g, _, _) in user_info:
        uid = int(u)
        if uid in training_users:
            target = gender_train
        elif uid in test_users:
            target = gender_test
        else:
            continue   # user has no selected rating
        target[uid] = g

# Sanity check: every training/test user must have received a gender label.
assert len(gender_train) == len(training_users)
assert len(gender_test) == len(test_users)
print("There are ",len(gender_train),"users for training")
print("There are ",len(gender_test),"users for testing")
There are  60000 ratings and 590 users
There are  419 users for training
There are  171 users for testing
In [2]:
# nf_tr = how many training users carry the label 'F'
nf_tr = list(gender_train.values()).count('F')
# tot_tr = how many training users there are in total
tot_tr = len(gender_train)
print("Proportion of training who are female",nf_tr,'/',tot_tr,'=',nf_tr/tot_tr)
Proportion of training who are female 111 / 419 = 0.2649164677804296

Evaluation

The following function can be used to evaluate your predictor on the test set. Your predictor may use ratings and gender_train but not gender_test. Your predictor should take a user and a second, keyword parameter called "para", a tunable value (for example, a pseudo-count) that can be varied.

In [3]:
import math
def evaluate(pred,para=1):
    """Score a gender predictor on the test users in gender_test.

    pred is a function called as pred(u, para=para) that returns the
    predicted probability that user u is female ('F').
    para is passed through unchanged to every call of pred.

    Returns (sum_squares_error, log_loss). The log loss is in bits (base 2).
    It is math.inf when pred assigns probability 0 to the observed gender of
    some test user (instead of raising a math domain error).
    """
    # Query the predictor once per user; both metrics reuse the same value.
    scored = [(pred(u, para=para), g) for (u, g) in gender_test.items()]
    sse = sum((pr - (1 if g == "F" else 0))**2 for (pr, g) in scored)
    # Probability the predictor gave to the gender that was actually observed.
    likelihoods = [pr if g == 'F' else 1 - pr for (pr, g) in scored]
    if any(p == 0 for p in likelihoods):
        # log(0) is undefined; an impossible observation has infinite loss.
        ll = math.inf
    else:
        ll = -sum(math.log(p, 2) for p in likelihoods)
    return (sse,ll)

The evaluation of two naive predictors

In [4]:
def pred_ave(u, para=1):
    """Predict the smoothed training proportion of females; u is ignored.

    para is a pseudo-count: it is added once to the female count and twice
    to the total count.
    """
    smoothed_females = nf_tr + para
    smoothed_total = tot_tr + 2*para
    return smoothed_females / smoothed_total
# Baseline 1: score pred_ave on the test users with the default para.
print("Errors for predicting average from training set",evaluate(pred_ave))
def pred_05(u, para=1):
    """Predict even odds (0.5) for every user; u and para are ignored."""
    even_odds = 0.5
    return even_odds
# Baseline 2: score the constant 0.5 predictor on the test users.
print("Errors for predicting 0.5",evaluate(pred_05))
Errors for predicting average from training set (36.902776445630444, 153.9002201825844)
Errors for predicting 0.5 (42.75, 171.0)

Averaging from movies that were rated

In [5]:
movie_stats = {}  # movie -> (#f, #m) dictionary
for (user, movie, _rating, _when) in ratings:
    if user not in gender_train:
        continue   # only training users' genders may be counted
    # Counts so far for this movie; (0,0) if it has not been seen yet.
    (nf, nm) = movie_stats.get(movie, (0, 0))
    gender = gender_train[user]
    if gender == "F":
        movie_stats[movie] = (nf + 1, nm)
    elif gender == "M":
        movie_stats[movie] = (nf, nm + 1)


user_stats = {} # user -> [movie] dictionary
for (user, movie, _rating, _when) in ratings:
    # Start an empty list on a user's first rating, then record the movie.
    user_stats.setdefault(user, []).append(movie)
In [6]:
def pred1(u,para=1):   # para is the prior count
    """Predict P(u is female) by pooling the counts of the movies u rated.

    Returns fc/(fc+mc), where fc and mc are the female and male counts from
    movie_stats summed over the movies u rated. para is a prior count split
    between fc and mc according to the training proportion of females.

    If there is no evidence at all (para is 0 and no rated movie appears in
    movie_stats), returns the training proportion nf_tr/tot_tr, which is the
    value the formula approaches as para goes to 0, instead of dividing by
    zero.
    """
    fc = para*nf_tr/tot_tr       # female count
    mc = para*(1 - nf_tr/tot_tr)   # male count
    for m in user_stats[u]:   # for each movie m that u rated
        if m in movie_stats:    # ignore movies which no training user rated
            (nf,nm) = movie_stats[m]
            # nf is #females and nm is #males in current movie
            fc += nf
            mc += nm
    total = fc + mc
    if total == 0:
        # no prior and no counts: fall back to the training base rate
        return nf_tr/tot_tr
    return fc/total
In [7]:
def pred2(u, para=1):   # para is the pseudo-count
    """Predict P(u is female) as the mean female proportion of u's movies.

    For each movie that u rated and that appears in movie_stats, the smoothed
    proportion (nf+para)/(nf+nm+2*para) is computed; the result is the
    average of these proportions.

    If none of the movies u rated appears in movie_stats there is nothing to
    average, so the training proportion nf_tr/tot_tr is returned instead of
    dividing by zero.
    """
    tot, cnt = (0,0)
    for m in user_stats[u]:
        if m in movie_stats:   # ignore movies which no training user rated
            (nf,nm) = movie_stats[m]
            tot += (nf+para)/(nf+nm+2*para)
            cnt += 1
    if cnt == 0:
        # no usable movie: fall back to the training base rate
        return nf_tr/tot_tr
    return tot/cnt
In [8]:
# Score both movie-based predictors on the test users with the default para.
print("Errors for pred 1",evaluate(pred1))
print("Errors for pred 2",evaluate(pred2))
Errors for pred 1 (36.30445120014031, 151.6369896169036)
Errors for pred 2 (35.72611555280531, 149.6073779890896)
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
def plotpc(which=1):   #which=0 is sse,  which=1 is logloss
    """Plot the test error of pred1, pred2 and pred_ave against the pseudo-count.

    which selects the component of evaluate's result that is plotted:
    0 is the sum-squares-error, 1 is the log loss.
    """
    #plt.ion()  # make it interactive
    plt.xlabel("pseudocount")
    plt.ylabel(("Log Loss" if which else "sum-squares-error")+" (lower is better)")
    plt.xscale('linear')  # Makes a 'log' or 'linear' scale
    xvalues = [x/10 for x in range(100)]   # pseudo-counts 0.0, 0.1, ..., 9.9
    # One curve per predictor, drawn in this order.
    curves = [(pred1, "pred 1"), (pred2, "pred 2"), (pred_ave, "trivial")]
    for (predictor, label) in curves:
        yvalues = [evaluate(predictor, para=x)[which] for x in xvalues]
        plt.plot(xvalues, yvalues, label=label)
    plt.legend(loc="upper right")    # display the legend
    print("Done!")

plotpc()
Done!
In [10]:
# Same plot, but which=0 selects the sum-squares-error instead of the log loss.
plotpc(0) ## sum-squares-error
Done!

Taking into account the actual ratings

movie_rating_stats[m][r] gives a pair (#females, #males) counting the training users who gave a rating of r to movie m

In [11]:
movie_rating_stats = {}  # movie, rating -> (#f, #m) dictionary
for (user, movie, rating, _when) in ratings:
    if user not in gender_train:
        continue   # only training users' genders may be counted
    if movie not in movie_rating_stats:
        # One (#f, #m) slot per index 0..5; entries are replaced, never
        # mutated, so sharing the initial tuple is safe.
        movie_rating_stats[movie] = [(0, 0)] * 6
    per_rating = movie_rating_stats[movie]
    (nf, nm) = per_rating[rating]
    gender = gender_train[user]
    if gender == "F":
        per_rating[rating] = (nf + 1, nm)
    elif gender == "M":
        per_rating[rating] = (nf, nm + 1)

user_to_movie_and_rating[u] gives the list of (m,r) pairs such that u rated movie m with a rating of r

In [12]:
user_to_movie_and_rating = {} # user -> [(movie,rating)] dictionary
for (user, movie, rating, _when) in ratings:
    # Start an empty list on a user's first rating, then record the pair.
    user_to_movie_and_rating.setdefault(user, []).append((movie, rating))
In [13]:
def pred3(u,para=1):
    """Predict P(u is female) by pooling counts of users who rated like u.

    Returns fc/(fc+mc), where fc and mc are the female and male counts from
    movie_rating_stats[m][r], summed over the (movie, rating) pairs (m, r) of
    user u. para is a prior count split between fc and mc according to the
    training proportion of females.

    If there is no evidence at all (para is 0 and every matching count is
    zero), returns the training proportion nf_tr/tot_tr, which is the value
    the formula approaches as para goes to 0, instead of dividing by zero.
    """
    fc,mc = para*nf_tr/tot_tr, para*(1-nf_tr/tot_tr)
    for m,r in user_to_movie_and_rating[u]:
        if m in movie_rating_stats:   # ignore movies which no training user rated
            (nf,nm) = movie_rating_stats[m][r]
            fc += nf
            mc += nm
    total = fc + mc
    if total == 0:
        # no prior and no counts: fall back to the training base rate
        return nf_tr/tot_tr
    return fc/total
In [14]:
def pred4(u, para=1):
    """Predict P(u is female) as a mean female proportion over u's ratings.

    For each (movie, rating) pair (m, r) of user u with m in
    movie_rating_stats, the smoothed proportion (nf+para)/(nf+nm+2*para) of
    females among the training users who gave m the rating r is computed.
    The result is the average of these proportions together with a prior of
    weight para at the training proportion nf_tr/tot_tr.

    para=0 is handled without dividing by zero: an empty (movie, rating)
    cell contributes 0.5, and a user with nothing to average gets
    nf_tr/tot_tr. Both are the values the formulas approach as para goes
    to 0.
    """
    tot, cnt = para*nf_tr/tot_tr, para
    for (m,r) in user_to_movie_and_rating[u]:
        if m in movie_rating_stats:   # ignore movies which no training user rated
            (nf,nm) = movie_rating_stats[m][r]
            denom = nf+nm+2*para
            # denom is 0 only when para is 0 and nobody gave m the rating r
            tot += (nf+para)/denom if denom != 0 else 0.5
            cnt += 1
    if cnt == 0:
        # no prior weight and no usable movie: fall back to the base rate
        return nf_tr/tot_tr
    return tot/cnt
In [15]:
# Score both rating-aware predictors on the test users with the default para.
print("Errors for pred 3",evaluate(pred3))
print("Errors for pred 4",evaluate(pred4))
Errors for pred 3 (35.86289062253914, 150.01235670521498)
Errors for pred 4 (34.50605554054473, 145.7071225203668)
In [16]:
def plotpc2(which=1):   #which=0 is sse,  which=1 is logloss
    """Plot the test error of pred4, pred2 and pred3 against the pseudo-count.

    which selects the component of evaluate's result that is plotted:
    0 is the sum-squares-error, 1 is the log loss. The y-axis label follows
    the selected metric.
    """
    #plt.ion()  # make it interactive
    plt.xlabel("pseudocount")
    plt.ylabel(("Log Loss" if which else "sum-squares-error")+" (lower is better)")
    plt.xscale('linear')  # Makes a 'log' or 'linear' scale
    xvalues = [x/10 for x in range(1,100)]   # pseudo-counts 0.1, 0.2, ..., 9.9
    # One curve per predictor, drawn in this order.
    curves = [(pred4, "pred 4"), (pred2, "pred 2"), (pred3, "pred 3")]
    for (predictor, label) in curves:
        yvalues = [evaluate(predictor, para=x)[which] for x in xvalues]
        plt.plot(xvalues, yvalues, label=label)

    plt.legend(loc="upper right")    # display the legend
    print("Done!")

plotpc2()
Done!
In [17]:
# Same comparison, but which=0 selects the sum-squares-error component.
plotpc2(0) # sum-squares-error
Done!
In [ ]: