Read data from the ratings.csv file
# Column names are supplied explicitly, so the first line of the file is
# read as data rather than as a header.
ratings <- read_csv(
  file = "ratings.csv",
  col_names = c("user_id", "movie_id", "rating", "timestamp")
)
## Parsed with column specification:
## cols(
## user_id = col_double(),
## movie_id = col_double(),
## rating = col_double(),
## timestamp = col_double()
## )
Loaded 305.2 Mb of ratings data, containing 10,000,054 ratings. Here’s a preview:
# Render the first six rows of the ratings as a table.
kable(head(ratings))
| user_id | movie_id | rating | timestamp |
|---|---|---|---|
| 1 | 122 | 5 | 838985046 |
| 1 | 185 | 5 | 838983525 |
| 1 | 231 | 5 | 838983392 |
| 1 | 292 | 5 | 838983421 |
| 1 | 316 | 5 | 838983392 |
| 1 | 329 | 5 | 838983392 |
# Plot the distribution of rating values, using bins 0.5 wide. Reference:
# https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=26
ggplot(ratings, aes(x = rating)) +
  geom_histogram(binwidth = 0.5) +
  # The argument is `labels`; the previous `label =` only worked through
  # partial argument matching.
  scale_y_continuous(labels = comma)
# Per-movie summary: `count` is the number of ratings the movie received
# (n() counts rows within each group) and `mean_r` is their average.
aggregate_by_movie <- ratings %>%
  group_by(movie_id) %>%
  summarise(
    count = n(),
    mean_r = mean(rating)
  )
# Histogram of movie popularity (= number of ratings per movie), drawn on a
# logarithmic x axis via scale_x_log10().
ggplot(aggregate_by_movie, aes(x = count)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Distribution of each movie's mean rating, shown as a histogram here and as
# a density curve in the next plot. Reference:
# https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=28
ggplot(aggregate_by_movie, aes(x = mean_r)) +
  geom_histogram() +
  scale_x_continuous()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same per-movie mean ratings, drawn as a red-filled density estimate.
ggplot(aggregate_by_movie, aes(x = mean_r)) +
  geom_density(fill = "red") +
  scale_x_continuous()
# rank movies by popularity and compute the cdf: for each k, the cumulative
# share of all ratings (in percent) that goes to the top-k movies
# https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=30
# hint: use dplyr's rank and arrange functions, and the base R sum and cumsum functions
# the result is stored in its own data frame so it can be reused when
# creating figure 2 from the paper below
cdf <- aggregate_by_movie %>%
  arrange(desc(count)) %>%
  mutate(
    sum_c = sum(count),
    cumsum_count = cumsum(count),
    per = cumsum_count / sum_c * 100,
    # Rows are already sorted by popularity, so row_number() yields a unique
    # rank 1..n. rank() would hand tied movies the same averaged (fractional)
    # rank, which is not a valid "top k" position.
    rank = row_number()
  )
# plot the CDF of movie popularity
cdf %>%
  ggplot() +
  geom_line(aes(x = rank, y = per))
# Per-user summary: `count` is how many ratings the user made and `mean` is
# the average rating they gave.
user_grp <- ratings %>%
  group_by(user_id) %>%
  summarise(
    count = n(),
    mean = mean(rating)
  )
# Histogram of user activity (= number of ratings each user made), on a
# logarithmic x axis.
ggplot(user_grp, aes(x = count)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# generate the equivalent of figure 2 of this paper:
# https://5harad.com/papers/long_tail.pdf
# Specifically, for the subset of users who rated at least 10 movies,
# produce a plot that shows the fraction of users satisfied (vertical
# axis) as a function of inventory size (horizontal axis). We will
# define "satisfied" as follows: an individual user is satisfied p% of
# the time at inventory of size k if at least p% of the movies they
# rated are contained in the top k most popular movies. As in the
# paper, produce one curve for the 100% user satisfaction level and
# another for 90%---do not, however, bother implementing the null
# model (shown in the dashed lines).
# Popularity rank of every movie (1 = most rated). row_number() on the sorted
# rows gives each movie a distinct rank, so "the top k movies" is well defined.
aggregate_by_movie <- ratings %>%
  group_by(movie_id) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  mutate(rank = row_number()) %>%
  select(movie_id, rank)
# NOTE(review): user_grp is not used by the rest of this section; the
# assignment is kept only so the variable holds the same value as before.
user_grp <- ratings %>% select(user_id, movie_id)
# Attach the movie's popularity rank to every rating.
user_ratings <- inner_join(ratings, aggregate_by_movie, by = "movie_id")

# For each user, the smallest inventory size k at which they are satisfied:
#   max_r   - rank of their least popular movie (100% of their movies in top k)
#   rank_90 - smallest rank covering at least 90% of their movies. type = 1 is
#             the inverse empirical CDF, so the result is always an actual
#             rank; the default (type 7) interpolates between ranks and can
#             report a cutoff at which fewer than 90% are covered.
# Only users who rated at least 10 movies are kept, as the exercise requires.
user_cutoffs <- user_ratings %>%
  group_by(user_id) %>%
  summarize(
    n_rated = n(),
    max_r = max(rank),
    rank_90 = quantile(rank, 0.90, type = 1, names = FALSE)
  ) %>%
  filter(n_rated >= 10)

# Fraction of users satisfied at each inventory size: count the users per
# cutoff, then accumulate (summarize returns the cutoffs in ascending order).
user_100 <- user_cutoffs %>%
  group_by(max_r) %>%
  summarize(count = n()) %>%
  mutate(cdf_100 = cumsum(count) / sum(count))
user_90 <- user_cutoffs %>%
  group_by(rank_90) %>%
  summarize(count = n()) %>%
  mutate(cdf_90 = cumsum(count) / sum(count))

# One curve per satisfaction level; mapping a constant to colour produces a
# legend that tells the two lines apart.
ggplot() +
  geom_line(data = user_100, aes(x = max_r, y = cdf_100, color = "100%")) +
  geom_line(data = user_90, aes(x = rank_90, y = cdf_90, color = "90%")) +
  labs(
    x = "Inventory size",
    y = "Fraction of users satisfied",
    color = "Satisfaction level"
  )