"""
@author: Roberto
"""
'''
Importar librerias
- numpy para operaciones de vectores y matrices
- pandas para manipular DataFrame (base de datos)
- Series para manipular columnas de DataFrame
'''
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
import os
# Login name of whoever runs the script; it selects the per-user checkout
# of the course repository used below.
user = os.getlogin()
print(user)

# Switch into the Lab3 folder so the relative "../data" path resolves.
lab3_dir = f"C:/Users/{user}/Documents/GitHub/1ECO35_2022_2/Lab3"
os.chdir(lab3_dir)

# Load the Netflix titles CSV into a DataFrame.
netflix = pd.read_csv("../data/netflix_titles.csv")

# A bare name only renders output in an interactive session; in a plain
# script run it does nothing.
netflix

# info() prints the column overview by itself; shape is the (rows, columns)
# tuple.
netflix.info()
print(netflix.shape)
# Counterpart of Stata's `summarize`. Per the original note, release_year is
# the only numeric column at this point.
netflix.describe()

# show_id: list the distinct values, count them, and check whether the
# column has no repeated values. Bracket and attribute access are
# interchangeable here.
print(netflix["show_id"].unique())
print(len(netflix["show_id"].unique()))
print(netflix["show_id"].is_unique)
print(len(netflix.show_id.unique()))
netflix.show_id.unique()
netflix.show_id.is_unique

# director: a single column is a pandas Series.
print(type(netflix.director))
netflix.director
print(netflix["director"].unique())

# isnull() and isna() are aliases; both return a boolean mask of the
# missing entries, and summing the mask counts them.
print(netflix.director.isnull())
print(netflix.director.isna())
netflix.director.isna().sum()

# dropna returns a new frame each time: the default and axis="index" drop
# rows holding any missing value, axis="columns" drops such columns.
netflix.dropna()
netflix.dropna(axis="index")
netflix.dropna(axis="columns")
# dropna does not modify `netflix`; it returns a new DataFrame. The original
# frame only changes when
#   1. the result is assigned (back to the same name or to a new one), or
#   2. the method supports it and is called with `inplace=True`.
# The result below is not kept, so `netflix` still has its missing directors.
netflix.dropna(subset=["director"])

# Replace missing directors by assigning the filled column back to the frame.
# The former follow-up `netflix.director.fillna(..., inplace=True)` was
# dropped: it was redundant after this assignment, and calling an in-place
# method on a column pulled off the frame is chained assignment, which pandas
# warns about and which does not update the parent under Copy-on-Write.
netflix["director"] = netflix["director"].fillna("Sin director")
# Helper column numbering the rows 0 .. n-1.
netflix["new_col"] = np.arange(len(netflix))

# .loc selects by label, and label slices include both end points.
netflix.loc[0:100, :]
netflix.loc[210:500, :]
netflix.loc[210:, :]

# Rows and columns can be selected together: all rows first, then a label
# range of rows, each restricted to three named columns.
id_type_desc = ["show_id", "type", "description"]
netflix.loc[:, id_type_desc]
netflix.loc[1000:2000, id_type_desc]

# With show_id as the index, rows can be looked up by their id label.
netflix2 = netflix.set_index(keys=["show_id"])
netflix2.loc[["s1", "s100", "s7000"]]

# reset_index returns a new frame with show_id back as a regular column;
# netflix2 itself is not modified.
netflix2.reset_index()
# .iloc selects by integer position of rows and columns; the end of a
# positional slice is excluded.
netflix.iloc[:100]
netflix.iloc[210:500]

# NOTE(review): position 12 is picked after `new_col` was appended to the
# frame; confirm it is the intended column.
picked_positions = [0, 1, 12]
netflix.iloc[:, picked_positions]
netflix.iloc[1000:2000, picked_positions]

# Column labels as a plain Python list.
[*netflix.columns]

# Sort by release year, oldest first and then newest first.
net_old = netflix.sort_values(by="release_year")
net_new = netflix.sort_values(by="release_year", ascending=False)

# Random integer from 1 to 9 per row (the upper bound 10 is exclusive), used
# as a secondary sort key.
netflix["number"] = np.random.randint(1, 10, size=len(netflix))

# Two-key sort: both ascending (result not kept), then year ascending with
# number descending.
netflix.sort_values(by=["release_year", "number"])
net_two_sort = netflix.sort_values(
    by=["release_year", "number"],
    ascending=[True, False],
)
# Column selection: one name yields a Series, a list of names a DataFrame.
netflix["director"]
netflix.loc[:, ["director", "cast"]]

# Row filter with a boolean mask; show the first ten matches.
netflix.loc[netflix.release_year < 2011].head(10)

# Combine conditions with & (and); each comparison needs its own
# parentheses. Attribute and bracket access give the same mask.
netflix[(netflix.release_year < 2011) & (netflix.number > 5)]
netflix[(netflix["release_year"] < 2011) & (netflix["number"] > 5)]

# First ten rows of a one-column DataFrame.
netflix[["director"]].iloc[:10]

# Titles whose country field is exactly the given string.
net_peru = netflix.loc[netflix["country"] == "Peru"]
net_mex = netflix.loc[netflix.country == "Mexico"]

# TV shows from Peru.
netflix.loc[(netflix.type == "TV Show") & (netflix.country == "Peru")]

# | is "or": number of titles from Brazil or from Peru.
len(netflix.loc[(netflix.country == "Brazil") | (netflix.country == "Peru")])

# Everything that is a TV show or comes from Peru.
netflix.loc[(netflix.type == "TV Show") | (netflix.country == "Peru")]

# US movies released after 2019.
netflix[
    (netflix.type == "Movie")
    & (netflix.country == "United States")
    & (netflix.release_year > 2019)
]

# isin tests membership in a list of values; ~ negates the mask.
net_per_ch = netflix.loc[netflix.country.isin(["Peru", "Chile"])]
netflix.loc[~netflix.country.isin(["Peru", "Chile"])]
"""
Alternative methods to filter
"""
movie = netflix["type"] == "Movie"
m_usa = netflix["country"] == "United States"
m_actual = netflix["release_year"] > 2019
movie_usa = netflix[movie & m_usa & m_actual]
movie_usa = netflix.loc[ ( netflix[ "type" ] == "Movie" )
& ( netflix[ "country" ] == "United States")
& ( netflix[ "release_year" ] > 2019 ) ]
movie_usa.drop(['show_id', 'director'], axis=1 )
movie_usa[movie_usa.release_year != 2020]
movie_usa.rename(columns = {'title':'Tituto_movie', 'duration':'Duration_movie'}, inplace = True)
movie_usa.to_csv("../movie_usa.csv")
movie_usa.to_excel("../movie_usa.xlsx")