Path: blob/main/Lessons/Lesson 13 - RecSys 1/Chapter_Notebooks/Chapter2.ipynb
871 views
Kernel: Python 3 (system-wide)
In [3]:
Out[3]:
'1.5.3'
In [4]:
Out[4]:
In [5]:
Out[5]:
pandas.core.frame.DataFrame
In [6]:
Out[6]:
(5000, 24)
In [7]:
Out[7]:
Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
'imdb_id', 'original_language', 'original_title', 'overview',
'popularity', 'poster_path', 'production_companies',
'production_countries', 'release_date', 'revenue', 'runtime',
'spoken_languages', 'status', 'tagline', 'title', 'video',
'vote_average', 'vote_count'],
dtype='object')
In [8]:
Out[8]:
adult False
belongs_to_collection NaN
budget 65000000
genres [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
homepage NaN
id 8844
imdb_id tt0113497
original_language en
original_title Jumanji
overview When siblings Judy and Peter discover an encha...
popularity 17.015539
poster_path /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
production_companies [{'name': 'TriStar Pictures', 'id': 559}, {'na...
production_countries [{'iso_3166_1': 'US', 'name': 'United States o...
release_date 12/15/95
revenue 262797249
runtime 104.0
spoken_languages [{'iso_639_1': 'en', 'name': 'English'}, {'iso...
status Released
tagline Roll the dice and unleash the excitement!
title Jumanji
video False
vote_average 6.9
vote_count 2413
Name: 1, dtype: object
In [9]:
Out[9]:
adult False
belongs_to_collection NaN
budget 65000000
genres [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
homepage NaN
id 8844
imdb_id tt0113497
original_language en
original_title Jumanji
overview When siblings Judy and Peter discover an encha...
popularity 17.015539
poster_path /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
production_companies [{'name': 'TriStar Pictures', 'id': 559}, {'na...
production_countries [{'iso_3166_1': 'US', 'name': 'United States o...
release_date 12/15/95
revenue 262797249
runtime 104.0
spoken_languages [{'iso_639_1': 'en', 'name': 'English'}, {'iso...
status Released
tagline Roll the dice and unleash the excitement!
video False
vote_average 6.9
vote_count 2413
Name: Jumanji, dtype: object
In [10]:
In [11]:
Out[11]:
In [12]:
Out[12]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 5000 non-null object
1 release_date 4996 non-null object
2 budget 5000 non-null int64
3 revenue 5000 non-null int64
4 runtime 4994 non-null float64
5 genres 5000 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 234.5+ KB
In [13]:
Out[13]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 5000 non-null object
1 release_date 4996 non-null object
2 budget 5000 non-null float64
3 revenue 5000 non-null int64
4 runtime 4994 non-null float64
5 genres 5000 non-null object
dtypes: float64(2), int64(1), object(3)
memory usage: 234.5+ KB
/tmp/ipykernel_673/1765380320.py:13: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
small_df['budget'] = small_df['budget'].apply(to_float)
/tmp/ipykernel_673/1765380320.py:16: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
small_df['budget'] = small_df['budget'].astype('float')
In [14]:
Out[14]:
/tmp/ipykernel_673/2397457688.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
small_df['release_date'] = pd.to_datetime(small_df['release_date'], errors='coerce')
/tmp/ipykernel_673/2397457688.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
small_df['year'] = small_df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
In [15]:
Out[15]:
In [16]:
Out[16]:
In [17]:
Out[17]:
In [18]:
Out[18]:
In [18]:
Out[18]:
pandas.core.series.Series
In [19]:
Out[19]:
298.0
0.0
In [20]:
Out[20]:
10050354.6348
0.0
In [21]:
Out[21]:
85013183.00000004
In [22]:
Out[22]:
1996 358
1998 336
1995 332
1997 318
2000 309
...
2020 1
2021 1
2018 1
2010 1
2024 1
Name: year, Length: 90, dtype: int64