Path: blob/main/Trabajo_grupal/WG6/Grupo7_py.ipynb
2714 views
Kernel: Python 3 (ipykernel)
In [1]:
import os # for usernanme y set direcotrio import pandas as pd import numpy as np import weightedcalcs as wc # ponderador o factor de expansion from tqdm import tqdm # controlar el tiempo en un loop () import warnings warnings.filterwarnings('ignore')
1. Merge de bases ENAHO 2019 y 2020 y deflactación de ingreso y gasto (Python y R)
2019
In [65]:
# Load the 2019 ENAHO files: module 01 (enaho01-2019-100.dta) and the
# sumaria file shipped with module 34.
ruta_modulo01_2019 = "C:/Users/Jose Pastor/Documents/datos_documents/enaho/2019/687-Modulo01/687-Modulo01/enaho01-2019-100.dta"
ruta_sumaria_2019 = "C:/Users/Jose Pastor/Documents/datos_documents/enaho/2019/687-Modulo34/687-Modulo34/sumaria-2019.dta"

enaho01 = pd.read_stata(ruta_modulo01_2019)
enaho34 = pd.read_stata(ruta_sumaria_2019)

# Print (rows, columns) of each table as a quick sanity check.
for _tabla in (enaho01, enaho34):
    print(_tabla.shape)
Out[65]:
(43868, 323)
(34565, 158)
In [66]:
# Attach the sumaria columns to every module 01 row (left merge).
# The match key is conglome + vivienda + hogar; validate="m:1" makes pandas
# raise if that key is repeated in enaho34. Columns present in both tables
# keep the module 01 version unsuffixed and tag the sumaria copy with "_y".
claves_hogar = ["conglome", "vivienda", "hogar"]
merge_base_2019 = enaho01.merge(
    enaho34,
    how="left",
    on=claves_hogar,
    suffixes=('', '_y'),
    validate="m:1",
)
In [67]:
# Remove every column whose name ends in "_y" (the duplicated right-hand
# copies produced by the merge suffixes).
mascara_y = merge_base_2019.columns.str.contains('_y$', regex=True)
merge_base_2019.drop(columns=merge_base_2019.columns[mascara_y], inplace=True)
2020
In [68]:
# Load the 2020 ENAHO files: module 01 (enaho01-2020-100.dta) and the
# sumaria file shipped with module 34.
# NOTE: this rebinds enaho01 / enaho34, replacing the 2019 tables.
ruta_modulo01_2020 = "C:/Users/Jose Pastor/Documents/datos_documents/enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta"
ruta_sumaria_2020 = "C:/Users/Jose Pastor/Documents/datos_documents/enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta"

enaho01 = pd.read_stata(ruta_modulo01_2020)
enaho34 = pd.read_stata(ruta_sumaria_2020)
In [69]:
# Attach the sumaria columns to every module 01 row (left merge).
# The match key is conglome + vivienda + hogar; validate="m:1" makes pandas
# raise if that key is repeated in enaho34. Columns present in both tables
# keep the module 01 version unsuffixed and tag the sumaria copy with "_y".
claves_hogar = ["conglome", "vivienda", "hogar"]
merge_base_2020 = enaho01.merge(
    enaho34,
    how="left",
    on=claves_hogar,
    suffixes=('', '_y'),
    validate="m:1",
)
In [70]:
# Remove every column whose name ends in "_y" (the duplicated right-hand
# copies produced by the merge suffixes).
mascara_y = merge_base_2020.columns.str.contains('_y$', regex=True)
merge_base_2020.drop(columns=merge_base_2020.columns[mascara_y], inplace=True)
Juntando bases 2019 y 2020
In [71]:
# Stack the 2019 and 2020 merged tables into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so pd.concat is used instead. ignore_index=True rebuilds a fresh 0..n-1
# index so row labels coming from the two years do not collide.
base_2019_2020 = pd.concat(
    [merge_base_2019, merge_base_2020],
    ignore_index=True,
)
base_2019_2020
Out[71]:
In [72]:
# Keep the first two characters of ubigeo as ubigeo_dep
# (presumably the department code; it is later matched against "dpto").
base_2019_2020['ubigeo_dep'] = base_2019_2020['ubigeo'].str.slice(stop=2)
In [73]:
# Cast the year column and ubigeo_dep to int so they can serve as keys in a
# later merge.
columnas_clave = ['aÑo', 'ubigeo_dep']
base_2019_2020[columnas_clave] = base_2019_2020[columnas_clave].astype(int)
Deflactor temporal
In [74]:
# Temporal deflator: read the deflator table and keep only the columns
# dpto, aniorec and i00.
ruta_deflactores = "C:/Users/Jose Pastor/Documents/datos_documents/enaho/2020/737-Modulo34/737-Modulo34/ConstVarGasto-Metodologia actualizada/Gasto2020/Bases/deflactores_base2020_new.dta"
columnas_deflactor = ["dpto", "aniorec", "i00"]

deflactores_base2020_new = pd.read_stata(ruta_deflactores)
deflactores_base2020_new = deflactores_base2020_new.loc[:, columnas_deflactor]
deflactores_base2020_new.head()
Out[74]:
In [75]:
# Bring the deflator columns into the pooled 2019-2020 table (left merge).
# Rows are matched on (ubigeo_dep, aÑo) against (dpto, aniorec) in the
# deflator table; validate="m:1" makes pandas raise if a (dpto, aniorec)
# pair is repeated there.
base_2019_2020 = base_2019_2020.merge(
    deflactores_base2020_new,
    how="left",
    left_on=["ubigeo_dep", "aÑo"],
    right_on=["dpto", "aniorec"],
    validate="m:1",
)
In [76]:
# Divide household income (inghog1d) and spending (gashog2d) by the common
# factor 12 * mieperho * ld * i00. The factor is built once and reused for
# both columns.
denominador = (
    12
    * base_2019_2020['mieperho']
    * base_2019_2020['ld']
    * base_2019_2020['i00']
)
base_2019_2020['ingreso_deflact'] = base_2019_2020['inghog1d'].div(denominador)
base_2019_2020['gasto_deflact'] = base_2019_2020['gashog2d'].div(denominador)
In [78]:
# Sort rows by household key (ascending) and renumber the index, so the
# table can be compared against the R results.
claves_orden = ['conglome', 'vivienda', 'hogar']
base_2019_2020 = base_2019_2020.sort_values(claves_orden, ignore_index=True)
base_2019_2020
Out[78]:
2. Salario por hora del trabajador dependiente (Python y R)
In [79]:
# Load module 05 and keep only the four columns used for the hourly wage:
# i524e1, i538e1, i513t and i518.
ruta_modulo05 = "C:/Users/Jose Pastor/Documents/datos_documents/enaho/2020/737-Modulo05/737-Modulo05/enaho01a-2020-500.dta"
columnas_modulo05 = ["i524e1", "i538e1", "i513t", "i518"]

enaho05 = pd.read_stata(ruta_modulo05)
enaho05 = enaho05.loc[:, columnas_modulo05]
In [80]:
# Replace every missing value in enaho05 with 0, then show the result.
enaho05 = enaho05.fillna(value=0)
enaho05
Out[80]:
In [94]:
# Build the inputs of the hourly wage:
#   ingreso = i524e1 + i538e1
#   horas   = i513t  + i518
enaho05['ingreso'] = enaho05['i524e1'] + enaho05['i538e1']
enaho05['horas'] = enaho05['i513t'] + enaho05['i518']

# The original follow-up `.replace([''], 0)` calls were dropped: on numeric
# columns there is no '' to replace, so they never changed anything.

# Hourly wage = ingreso / (horas * 52).
# Rows with 0 hours are turned into missing BEFORE dividing; otherwise a
# positive income over 0 hours yields inf, which would silently contaminate
# any later mean or sum of the wage column.
horas_por_52 = (enaho05['horas'] * 52).replace(0, np.nan)
enaho05['sal_hora_depend'] = enaho05['ingreso'] / horas_por_52

# An hourly wage of exactly 0 is treated as missing.
enaho05['sal_hora_depend'] = enaho05['sal_hora_depend'].replace(0, np.nan)

# Show the computed column.
enaho05[['sal_hora_depend']]
Out[94]:
3. Groupby (Python y R)
In [95]:
# Load the 2020 module 02 file (enaho01-2020-200.dta) and the 2020 sumaria
# file from module 34.
ruta_modulo02 = "C:/Users/Jose Pastor/Documents/datos_documents/enaho/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta"
ruta_sumaria = "C:/Users/Jose Pastor/Documents/datos_documents/enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta"

enaho02 = pd.read_stata(ruta_modulo02)
enaho34 = pd.read_stata(ruta_sumaria)
In [96]:
# Collapse module 02 to one row per household (conglome, vivienda, hogar),
# keeping the maximum of p208a as edad_max. This is the input used later to
# flag households with a member older than 65.
# The aggregation is named with the string 'max' rather than the Python
# builtin: passing the builtin callable to agg is deprecated in pandas
# (FutureWarning since 2.1) and its meaning will change in a future release.
enaho02_1 = (
    enaho02
    .groupby(["conglome", "vivienda", "hogar"], as_index=False)
    .agg(edad_max=('p208a', 'max'))
)
enaho02_1
Out[96]:
In [97]:
# Left merge: keep every household row of enaho02_1 and attach the sumaria
# columns, matching on conglome + vivienda + hogar. validate="m:1" makes
# pandas raise if that key is repeated in enaho34.
claves_hogar = ["conglome", "vivienda", "hogar"]
base_final_2 = enaho02_1.merge(
    enaho34,
    how="left",
    on=claves_hogar,
    validate="m:1",
)
In [98]:
# Monthly per-member spending: gashog2d / (12 * mieperho).
base_final_2['gasto_month'] = base_final_2['gashog2d'] / (12 * base_final_2['mieperho'])

# Label a household "pobre" when its monthly per-member spending is below the
# poverty line (linea), "no pobre" otherwise.
# A comparison against NaN is always False, so a plain np.where would label
# households with missing spending or missing line as "no pobre". Those rows
# are left as missing instead.
es_comparable = base_final_2['gasto_month'].notna() & base_final_2['linea'].notna()
etiqueta = np.where(
    base_final_2['gasto_month'] < base_final_2['linea'],
    "pobre",
    "no pobre",
)
base_final_2["pobre"] = pd.Series(
    etiqueta, index=base_final_2.index, dtype=object
).where(es_comparable)
In [99]:
# Display base_final_2 as the cell output.
base_final_2
Out[99]:
Dummy: más de 65 años y pobre
In [100]:
# Strategy: build one 0/1 dummy for poverty and one for "oldest member is
# over 65", add them, and flag the rows where the sum equals 2 (both
# conditions hold) with a final 0/1 dummy.
es_pobre = base_final_2["pobre"].eq('pobre')
es_mayor65 = base_final_2["edad_max"].gt(65)

base_final_2["dummy_pobre"] = es_pobre.astype(int)
base_final_2["dummy_mayor65"] = es_mayor65.astype(int)

# Sum of the two dummies: 0, 1 or 2.
base_final_2['dummy_suma'] = base_final_2["dummy_pobre"] + base_final_2["dummy_mayor65"]

# 1 only where both dummies are 1 (sum == 2), 0 elsewhere.
base_final_2['dummy_pobre_mayor65'] = base_final_2['dummy_suma'].eq(2).astype(int)
In [101]:
# Display base_final_2 as the cell output.
base_final_2
Out[101]: