Path: blob/main/Trabajo_grupal/WG6/Solución/solucion_py.py
# -*- coding: utf-8 -*-
"""
@author: Roberto
"""

import pandas as pd
import numpy as np
import os

user = os.getlogin()  # Username

os.chdir(f"C:/Users/{user}/Documents/GitHub/1ECO35_2022_2")  # Set working directory


#%% Load datasets

enaho_19_01 = pd.read_stata(r"../../datos/2019/687-Modulo01/687-Modulo01/enaho01-2019-100.dta",
                            convert_categoricals=False)

enaho_19_34 = pd.read_stata(r"../../datos/2019/687-Modulo34/687-Modulo34/sumaria-2019.dta",
                            convert_categoricals=False)

deflactor = pd.read_stata(
    r"../../datos/2020/737-Modulo34/737-Modulo34/ConstVarGasto-Metodologia actualizada/Gasto2020/Bases/deflactores_base2020_new.dta",
    convert_categoricals=False)


###### Year 2019 ######

enaho_19_01.rename(columns={'aÑo': 'year'}, inplace=True)

# Select variables from module 1

enaho_19_01 = enaho_19_01[['year', 'conglome', 'vivienda', 'hogar', 'ubigeo']]

# Select variables from the sumaria module

enaho_19_34 = enaho_19_34[['conglome', 'vivienda', 'hogar', 'mieperho', 'inghog1d', 'gashog2d', 'ld']]

# Merge both datasets

enaho_merge_19 = pd.merge(enaho_19_01, enaho_19_34,
                          on=["conglome", "vivienda", "hogar"],
                          how="left")

###### Year 2020 ######

enaho_20_01 = pd.read_stata(r"../../datos/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta",
                            convert_categoricals=False)

enaho_20_34 = pd.read_stata(r"../../datos/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
                            convert_categoricals=False)

enaho_20_01.rename(columns={'aÑo': 'year'}, inplace=True)

# Select variables from module 1

enaho_20_01 = enaho_20_01[['year', 'conglome', 'vivienda', 'hogar', 'ubigeo']]

# Select variables from the sumaria module

enaho_20_34 = enaho_20_34[['conglome', 'vivienda', 'hogar', 'mieperho', 'inghog1d', 'gashog2d', 'ld']]

# Merge both datasets

enaho_merge_20 = pd.merge(enaho_20_01, enaho_20_34,
                          on=["conglome", "vivienda", "hogar"],
                          how="left")


##### Append #####

# Stack both years; pd.concat replaces DataFrame.append, which was removed in pandas 2.0
enaho_append = pd.concat([enaho_merge_19, enaho_merge_20], ignore_index=True)


## Merge with the deflator dataset

# Take the first two digits of ubigeo (the department code) and convert them to an
# integer, since ubigeo is a string (text)

enaho_append['ubigeo_dep'] = enaho_append['ubigeo'].str[:2].astype(int)

enaho_append['year']  # it is a string

# To merge, the key variables must share the same format:
# the department variable in the deflator dataset must be an integer,
# while the year variable must be a string

deflactor['dpto'] = deflactor['dpto'].astype(int)

deflactor['aniorec'] = deflactor['aniorec'].astype(int).astype(str)
# Note that aniorec is a float; it is first cast to integer and then to string
# to drop the decimal part (.0)

deflactor.info()  # check the variable formats

enaho_append = pd.merge(enaho_append, deflactor, left_on=['year', 'ubigeo_dep'],
                        right_on=['aniorec', 'dpto'], how="left", validate="m:1")


# Variable creation

# Real per-capita monthly income and expenditure: annual household totals divided by
# 12 months, the deflators ld (from sumaria) and i00 (from the deflator dataset),
# and the number of household members (mieperho)

enaho_append['ing_pc_real'] = enaho_append['inghog1d'] / (12 * enaho_append['ld'] * enaho_append['i00'] * enaho_append['mieperho'])

enaho_append['gast_pc_real'] = enaho_append['gashog2d'] / (12 * enaho_append['ld'] * enaho_append['i00'] * enaho_append['mieperho'])


#%% Hourly wage (question 2)

enaho_20_05 = pd.read_stata(r"../../datos/2020/737-Modulo05/737-Modulo05/enaho01a-2020-500.dta",
                            convert_categoricals=False)

# sum() adds across columns while ignoring missing values

enaho_20_05['suma_ingreso'] = enaho_20_05[["i524e1", "i538e1"]].sum(axis=1)

# axis=1: horizontal sum, i.e. by row

enaho_20_05['total_horas'] = enaho_20_05[["i513t", "i518"]].sum(axis=1)

# Hourly wage: annual labour income over annual hours (weekly hours * 52 weeks)

enaho_20_05['hour_wage'] = enaho_20_05['suma_ingreso'] / (enaho_20_05['total_horas'] * 52)

# If the hourly wage is zero, replace it with missing

enaho_20_05['hour_wage'] = enaho_20_05['hour_wage'].replace(0, np.nan)


#%% Group-by

enaho_20_02 = pd.read_stata(r"../../datos/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta",
                            convert_categoricals=False)

enaho_20_34 = pd.read_stata(r"../../datos/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
                            convert_categoricals=False)

# nanmax ignores missing values.

df1 = enaho_20_02.groupby(["conglome", "vivienda", "hogar"],
                          as_index=False).agg(edad_max=('p208a', np.nanmax))

df2 = enaho_20_02.groupby(["conglome", "vivienda", "hogar"],
                          as_index=False).agg(edad_max=('p208a', np.max))

# Both results are identical because the age column has no missing values

# If we want the household's maximum age as a value for each household member

enaho_20_02["edad_maxima_hogar"] = enaho_20_02.groupby(
    ["conglome", "vivienda", "hogar"])['p208a'].transform(np.nanmax)

# Merge with sumaria (module 34)

enaho_pension = pd.merge(df2, enaho_20_34, on=["conglome", "vivienda", "hogar"],
                         how="left", validate="1:1")

# np.where with the condition: maximum age of 65 or older and a poor household
# (pobreza -> 1: extreme poor, 2: poor)

enaho_pension['hogar_benf_pen'] = np.where(
    (enaho_pension['edad_max'] >= 65) & (enaho_pension['pobreza'].isin([1, 2])),
    1, 0)

# Set the indicator to missing if edad_max or pobreza has missing values

# enaho_pension[['edad_max','pobreza']].isnull().any(axis=1) is True when either
# variable is missing; in that case the observation is replaced with np.nan

enaho_pension['hogar_benf_pen'] = enaho_pension['hogar_benf_pen'].mask(
    enaho_pension[['edad_max', 'pobreza']].isnull().any(axis=1), np.nan)

# Total number of households to be targeted by the Pension 65 program

enaho_pension['hogar_benf_pen'].value_counts()