1 Load and preview data

Read data from the ratings.csv file

# Column names are supplied explicitly, so the first line of the file is
# read as data rather than as a header.
ratings <- read_csv(
  "ratings.csv",
  col_names = c("user_id", "movie_id", "rating", "timestamp")
)
## Parsed with column specification:
## cols(
##   user_id = col_double(),
##   movie_id = col_double(),
##   rating = col_double(),
##   timestamp = col_double()
## )

Loaded 305.2 Mb of ratings data, containing 10,000,054 ratings. Here’s a preview:

# Render the first rows of the ratings table.
kable(head(ratings))
user_id movie_id rating timestamp
1 122 5 838985046
1 185 5 838983525
1 231 5 838983392
1 292 5 838983421
1 316 5 838983392
1 329 5 838983392

2 Summary statistics

# plot the distribution of rating values https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=26
# Ratings come in half-star steps, so each 0.5-wide bin holds one rating value.
# `labels` is spelled out in full instead of relying on partial argument
# matching of `label`.
ratings %>%
  ggplot(aes(x = rating)) +
  geom_histogram(binwidth = 0.5) +
  scale_y_continuous(labels = comma)

2.1 Per-movie stats

# aggregate ratings by movie, computing mean and number of ratings
# hint: use the n() function for easy counting within a group
# One row per movie: `count` = number of ratings, `mean_r` = average rating.
aggregate_by_movie <- ratings %>%
  group_by(movie_id) %>%
  summarize(
    count = n(),
    mean_r = mean(rating)
  )
# plot distribution of movie popularity (= number of ratings the movie received)
# hint: try scale_x_log10() for a logarithmic x axis
ggplot(aggregate_by_movie, aes(x = count)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# plot distribution of mean ratings by movie https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=28
# hint: try geom_histogram and geom_density
# Histogram of the per-movie average rating (default binning).
ggplot(aggregate_by_movie, aes(x = mean_r)) +
  geom_histogram() +
  scale_x_continuous()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Smoothed (kernel density) view of the per-movie average rating.
ggplot(aggregate_by_movie, aes(x = mean_r)) +
  geom_density(fill = "red") +
  scale_x_continuous()

# rank movies by popularity and compute the cdf, or fraction of ratings covered by the top-k movies https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=30
# hint: use dplyr's rank and arrange functions, and the base R sum and cumsum functions
# store the result in a new data frame so you can use it in creating figure 2 from the paper below
#
# Columns added:
#   sum_c        - total number of ratings across all movies
#   cumsum_count - ratings covered by this movie and every more popular one
#   per          - cumsum_count as a percentage of sum_c
#   rank         - popularity rank, 1 = most-rated movie
# row_number() is used instead of base rank(): rank() assigns tied movies the
# same averaged rank, which puts several `per` values at one x position and
# draws vertical jumps in the line plot. After arrange(), row_number() gives
# each movie a unique, consecutive rank.
cdf <- aggregate_by_movie %>%
  arrange(desc(count)) %>%
  mutate(
    sum_c = sum(count),
    cumsum_count = cumsum(count),
    per = cumsum_count / sum_c * 100,
    rank = row_number()
  )
cdf %>%
  ggplot(aes(x = rank, y = per)) +
  geom_line()

# plot the CDF of movie popularity

3 Per-user stats

# aggregate ratings by user, computing mean and number of ratings
# One row per user: `count` = number of ratings made, `mean` = average rating given.
user_grp <- ratings %>%
  group_by(user_id) %>%
  summarize(
    count = n(),
    mean = mean(rating)
  )
# plot distribution of user activity (= number of ratings the user made)
# hint: try a log scale here
ggplot(user_grp, aes(x = count)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4 Anatomy of the long tail

# generate the equivalent of figure 2 of this paper:
# https://5harad.com/papers/long_tail.pdf

# Specifically, for the subset of users who rated at least 10 movies,
# produce a plot that shows the fraction of users satisfied (vertical
# axis) as a function of inventory size (horizontal axis). We will
# define "satisfied" as follows: an individual user is satisfied p% of
# the time at inventory of size k if at least p% of the movies they
# rated are contained in the top k most popular movies. As in the
# paper, produce one curve for the 100% user satisfaction level and
# another for 90%---do not, however, bother implementing the null
# model (shown in the dashed lines).
# Popularity rank of every movie: rank 1 is the most-rated movie, and ties in
# rating count are broken by row order so that ranks are unique.
aggregate_by_movie <- ratings %>%
  group_by(movie_id) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  mutate(rank = row_number()) %>%
  select(movie_id, rank)

# NOTE(review): user_grp is not used anywhere below; the assignment is kept
# only in case code outside this section relies on it.
user_grp <- ratings %>% select(user_id, movie_id)

# Attach each rated movie's popularity rank to the rating.
user_ratings <- inner_join(ratings, aggregate_by_movie, by = "movie_id")

# Per-user inventory size needed for each satisfaction level, restricted to
# users who rated at least 10 movies (as the task statement requires).
#   max_r   - rank of the least popular movie the user rated; an inventory of
#             this size contains 100% of the user's movies.
#   rank_90 - smallest rank k such that at least 90% of the user's movies have
#             rank <= k. quantile type 1 (inverse empirical CDF) returns an
#             actual observed rank; the default type 7 interpolates between
#             ranks and does not match the "at least 90%" definition.
user_inventory <- user_ratings %>%
  group_by(user_id) %>%
  filter(n() >= 10) %>%
  summarize(
    max_r = max(rank),
    rank_90 = quantile(rank, 0.90, type = 1, names = FALSE)
  )

# Fraction of users fully satisfied at each inventory size: count users per
# required inventory size, then accumulate in increasing order of size.
user_100 <- user_inventory %>%
  group_by(max_r) %>%
  summarize(count = n()) %>%
  arrange(max_r) %>%
  mutate(cdf_100 = cumsum(count) / sum(count))

# Same computation for the 90% satisfaction level.
user_90 <- user_inventory %>%
  group_by(rank_90) %>%
  summarize(count = n()) %>%
  arrange(rank_90) %>%
  mutate(cdf_90 = cumsum(count) / sum(count))

# One curve per satisfaction level; colour is mapped to a constant label so
# the two curves can be told apart in the legend.
ggplot() +
  geom_line(data = user_100, aes(x = max_r, y = cdf_100, color = "100%")) +
  geom_line(data = user_90, aes(x = rank_90, y = cdf_90, color = "90%")) +
  labs(
    x = "Inventory size",
    y = "Fraction of users satisfied",
    color = "Satisfaction level"
  )