import pandas as pd
import numpy as np

# Read the CSV file into df.
# Note: the dataset has been truncated to 5000 rows for illustration; the
# actual data has over 40000 rows. The full dataset is available on Kaggle here:
# https://www.kaggle.com/rounakbanik/the-movies-dataset/downloads/the-movies-dataset.zip/7
# The recommenders work better with more data, of course.
df = pd.read_csv('./data/movies_metadata.csv')

# Notebook display of the first rows; has no effect when run as a script.
df.head()
Out[1]:
In [2]:
# Calculate the number of votes garnered by the 80th percentile movie.
# m is the 0.80 quantile of df['vote_count'].
m = df['vote_count'].quantile(0.80)

# Notebook display of the value; has no effect when run as a script.
m
Out[2]:
255.20000000000027
In [3]:
# Only consider movies whose runtime is at least 45 minutes and at most
# 300 minutes (both bounds inclusive).
q_movies = df[(df['runtime'] >= 45) & (df['runtime'] <= 300)]

# Only consider movies that have garnered at least m votes.
# Take an explicit copy so that adding columns to q_movies later operates on
# an independent frame instead of a slice of df (avoids pandas'
# SettingWithCopyWarning).
q_movies = q_movies[q_movies['vote_count'] >= m].copy()

# Inspect the number of movies that made the cut.
q_movies.shape
Out[3]:
(999, 24)
In [4]:
# Calculate C, the mean of 'vote_average' over all rows of df
# (the unfiltered frame, not q_movies).
C = df['vote_average'].mean()

# Notebook display of the value; has no effect when run as a script.
C
Out[4]:
6.06916
In [5]:
# Function to compute the IMDB weighted rating for each movie
def weighted_rating(x, m, C):
    """Return the weighted rating of ``x``.

    The score is ``v / (v + m) * R + m / (m + v) * C`` where ``v`` is
    ``x['vote_count']`` and ``R`` is ``x['vote_average']``. The two weights
    sum to one, so the result lies between ``R`` and ``C``: it equals ``C``
    when ``v`` is 0 and approaches ``R`` as ``v`` grows relative to ``m``.

    Args:
        x: Object indexable by ``'vote_count'`` and ``'vote_average'``
            (e.g. a DataFrame row or a dict).
        m: Vote-count weight applied to ``C``.
        C: Rating that the score is pulled toward when ``v`` is small
            compared with ``m``.

    Returns:
        The weighted score.

    Note:
        ``v + m`` must be non-zero, since it is used as a divisor.
    """
    v = x['vote_count']
    R = x['vote_average']
    # Compute the weighted score
    return (v / (v + m) * R) + (m / (m + v) * C)
In [7]:
# Compute the score by applying weighted_rating to every row (axis=1),
# passing m and C as the extra positional arguments.
# assign() builds a new DataFrame with the 'score' column instead of setting
# the column on a filtered slice, which would raise SettingWithCopyWarning.
q_movies = q_movies.assign(
    score=q_movies.apply(weighted_rating, args=(m, C), axis=1)
)
In [8]:
# Sort movies in descending order of their scores.
q_movies = q_movies.sort_values('score', ascending=False)

# Print the top 25 movies (notebook display; no effect when run as a script).
q_movies[['title', 'vote_count', 'vote_average', 'score', 'runtime']].head(25)