CoCalc -- Chapter2.ipynb

GitHub Repository: DataScienceUWL/DS775
Path: blob/main/Lessons/Lesson 13 - RecSys 1/Chapter_Notebooks/Chapter2.ipynb
⁸⁷¹ views

Kernel: Python 3 (system-wide)

In [3]:

import pandas as pd
pd.__version__

Out[3]:

'1.5.3'

In [4]:

#Read the CSV File into df
# Note we have truncated the dataset to 5000 rows for illustration, the actual data has over 40000 rows
# the full dataset is available on Kaggle here
# https://www.kaggle.com/rounakbanik/the-movies-dataset/downloads/the-movies-dataset.zip/7
# the recommenders work better with more data of course

df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)

#We will find out what the following code does a little later!
df.head()

Out[4]:

In [5]:

#Output the type of df
type(df)

Out[5]:

pandas.core.frame.DataFrame

In [6]:

#Output the shape of df
df.shape

Out[6]:

(5000, 24)

In [7]:

#Output the columns of df
df.columns

Out[7]:

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [8]:

#Select the second movie in df
second = df.iloc[1]
second

Out[8]:

adult                                                                False
belongs_to_collection                                                  NaN
budget                                                            65000000
genres                   [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
homepage                                                               NaN
id                                                                    8844
imdb_id                                                          tt0113497
original_language                                                       en
original_title                                                     Jumanji
overview                 When siblings Judy and Peter discover an encha...
popularity                                                       17.015539
poster_path                               /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
production_companies     [{'name': 'TriStar Pictures', 'id': 559}, {'na...
production_countries     [{'iso_3166_1': 'US', 'name': 'United States o...
release_date                                                      12/15/95
revenue                                                          262797249
runtime                                                              104.0
spoken_languages         [{'iso_639_1': 'en', 'name': 'English'}, {'iso...
status                                                            Released
tagline                          Roll the dice and unleash the excitement!
title                                                              Jumanji
video                                                                False
vote_average                                                           6.9
vote_count                                                            2413
Name: 1, dtype: object

In [9]:

#Change the index to the title
df = df.set_index('title')

#Access the movie with title 'Jumanji'
jum = df.loc['Jumanji']
jum

Out[9]:

adult                                                                False
belongs_to_collection                                                  NaN
budget                                                            65000000
genres                   [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
homepage                                                               NaN
id                                                                    8844
imdb_id                                                          tt0113497
original_language                                                       en
original_title                                                     Jumanji
overview                 When siblings Judy and Peter discover an encha...
popularity                                                       17.015539
poster_path                               /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
production_companies     [{'name': 'TriStar Pictures', 'id': 559}, {'na...
production_countries     [{'iso_3166_1': 'US', 'name': 'United States o...
release_date                                                      12/15/95
revenue                                                          262797249
runtime                                                              104.0
spoken_languages         [{'iso_639_1': 'en', 'name': 'English'}, {'iso...
status                                                            Released
tagline                          Roll the dice and unleash the excitement!
video                                                                False
vote_average                                                           6.9
vote_count                                                            2413
Name: Jumanji, dtype: object

In [10]:

df = df.reset_index()

In [11]:

#Create a smaller dataframe with a subset of all features
small_df = df[['title', 'release_date', 'budget', 'revenue', 'runtime', 'genres']]

#Output only the first 5 rows of small_df
small_df.head()

Out[11]:

In [12]:

#Get information of the data types of each feature
small_df.info()

Out[12]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         5000 non-null   object 
 1   release_date  4996 non-null   object 
 2   budget        5000 non-null   int64  
 3   revenue       5000 non-null   int64  
 4   runtime       4994 non-null   float64
 5   genres        5000 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 234.5+ KB

In [13]:

#Import the numpy library 
import numpy as np

#Function to convert to float manually
def to_float(x):
    try:
        x = float(x)
    except: 
        x = np.nan
    return x

#Apply the to_float function to all values in the budget column
small_df['budget'] = small_df['budget'].apply(to_float)

#Try converting to float using pandas astype
small_df['budget'] = small_df['budget'].astype('float')

#Get the data types for all features
small_df.info()

Out[13]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         5000 non-null   object 
 1   release_date  4996 non-null   object 
 2   budget        5000 non-null   float64
 3   revenue       5000 non-null   int64  
 4   runtime       4994 non-null   float64
 5   genres        5000 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 234.5+ KB

/tmp/ipykernel_673/1765380320.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_df['budget'] = small_df['budget'].apply(to_float)
/tmp/ipykernel_673/1765380320.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_df['budget'] = small_df['budget'].astype('float')

In [14]:

#Convert release_date into pandas datetime format
small_df['release_date'] = pd.to_datetime(small_df['release_date'], errors='coerce')

#Extract year from the datetime
small_df['year'] = small_df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

Out[14]:

/tmp/ipykernel_673/2397457688.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_df['release_date'] = pd.to_datetime(small_df['release_date'], errors='coerce')
/tmp/ipykernel_673/2397457688.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_df['year'] = small_df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [15]:

#Sort DataFrame based on release year
small_df = small_df.sort_values('year')

small_df.head()

Out[15]:

In [16]:

#Sort Movies based on revenue (in descending order)
small_df = small_df.sort_values('revenue', ascending=False)

small_df.head()

Out[16]:

In [17]:

#Select only those movies which earned more than 1 billion
new = small_df[small_df['revenue'] > 1e9]

new

Out[17]:

In [18]:

#Select only those movies which earned more than 1 billion and spent less than 150 million
new2 = small_df[(small_df['revenue'] > 1e9) & (small_df['budget'] < 1.5e8)]
new2

Out[18]:

In [18]:

type(small_df['year'])

Out[18]:

pandas.core.series.Series

In [19]:

#Get the runtime Series object
runtime = small_df['runtime']

#Print the longest runtime of any movie
print(runtime.max())

#Print the shortest runtime of any movie
print(runtime.min())

Out[19]:

298.0
0.0

In [20]:

#Get the budget Series object
budget = small_df['budget']

#Print the mean budget of the movies
print(budget.mean())

#Print the median budget of the movies
print(budget.median())

Out[20]:

10050354.6348
0.0

In [21]:

#Get the revenue Series object
revenue = small_df['revenue']

#Revenue generated by the 90th percentile movie
revenue.quantile(0.90)

Out[21]:

85013183.00000004

In [22]:

#Get number of movies released each year
small_df['year'].value_counts()

Out[22]:

  358
  336
  332
  318
  309
       ... 
    1
    1
    1
    1
    1
Name: year, Length: 90, dtype: int64

Product

Resources

Company