# Path: blob/main/Trabajo_grupal/WG6/Grupo_4_py.py
# 2714 views
# -*- coding: utf-8 -*-
"""
###############################################################################
#                                                                             #
#                              WG#6 - Grupo 4                                 #
#                                                                             #
###############################################################################

Members:

Luana Morales
Seidy Ascensios
Marcela Quintero
Flavia Oré

"""

#%% Questions 1 and 2 (merge datasets and hourly wage of dependent workers)

# `!pip install weightedcalcs` is IPython-only syntax and a SyntaxError in a
# plain .py file. Install the dependency once from a shell instead:
#     pip install weightedcalcs
import os  # username lookup and working-directory handling
import pandas as pd
import numpy as np
import weightedcalcs as wc  # survey weights
from tqdm import tqdm  # progress reporting inside loops


def _merge_households(master: pd.DataFrame, using: pd.DataFrame) -> pd.DataFrame:
    """Left-merge *using* onto *master* by household key.

    The key is ("conglome", "vivienda", "hogar"). Columns present in both
    frames keep the master's name; the copy coming from *using* receives a
    "_y" suffix and is dropped afterwards (together with any other column
    whose name ends in "_y").

    Parameters
    ----------
    master : DataFrame kept in full (left side of the merge).
    using : DataFrame providing the extra columns; must be unique on the key.

    Returns
    -------
    DataFrame
        The merged frame without the "_y" columns.

    Raises
    ------
    pandas.errors.MergeError
        If *using* has duplicated keys (enforced by ``validate="m:1"``).
    """
    merged = pd.merge(
        master,
        using,
        on=["conglome", "vivienda", "hogar"],
        how="left",
        validate="m:1",
        suffixes=("", "_y"),
    )
    duplicated = merged.columns[merged.columns.str.contains("_y$", regex=True)]
    return merged.drop(columns=duplicated)


# 1) Set the working directory

user = os.getlogin()  # username

os.chdir(f"C:/Users/{user}/Documents/GitHub/1ECO35_2022_2/Trabajo_grupal/WG6")


# ENAHO 2020

# 2) Load the ENAHO datasets

enaho_2020 = pd.read_stata(r"../../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta")

enaho01 = pd.read_stata(r"../../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta",
                        convert_categoricals=False)

# iterator=True returns a reader object (used to query labels), not a DataFrame.
labels01 = pd.read_stata(r"../../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta",
                         convert_categoricals=False, iterator=True)

enaho34 = pd.read_stata(r"../../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
                        convert_categoricals=False)

labels34 = pd.read_stata(r"../../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
                         convert_categoricals=False, iterator=True)

# 4) Merge section (left merge)
# enaho34: master data
# enaho01: using data

merge_base_2020 = _merge_households(enaho34, enaho01)


# ENAHO 2019

enaho_2019 = pd.read_stata(r"../../../../enaho/2019/737-Modulo01/737-Modulo01/enaho01-2019-100.dta")

enaho01_1 = pd.read_stata(r"../../../../enaho/2019/737-Modulo01/737-Modulo01/enaho01-2019-100.dta",
                          convert_categoricals=False)

labels01_1 = pd.read_stata(r"../../../../enaho/2019/737-Modulo01/737-Modulo01/enaho01-2019-100.dta",
                           convert_categoricals=False, iterator=True)

enaho34_1 = pd.read_stata(r"../../../../enaho/2019/737-Modulo34/737-Modulo34/sumaria-2019.dta",
                          convert_categoricals=False)

labels34_1 = pd.read_stata(r"../../../../enaho/2019/737-Modulo34/737-Modulo34/sumaria-2019.dta",
                           convert_categoricals=False, iterator=True)

# 4) Merge section (left merge)
# enaho34_1: master data
# enaho01_1: using data

merge_base_2019 = _merge_households(enaho34_1, enaho01_1)


# Append

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True rebuilds the index so the two years do not clash.
merge_append = pd.concat([merge_base_2020, merge_base_2019], ignore_index=True)

# First two characters of ubigeo.
# NOTE(review): presumably the department code - confirm.
merge_append['dpto'] = merge_append['ubigeo'].str[:2]

merge_append.to_stata("append_enaho.dta", write_index=False)

# Deflate

deflactores_base2020_new = pd.read_stata(r"../../../../enaho/2020/737-Modulo34/737-Modulo34/ConstVarGasto-Metodologia actualizada/Gasto2020/Bases/deflactores_base2020_new.dta",
                                         convert_categoricals=False)


# 4) Merge section: deflators
# merge_append: master data
# deflactores_base2020_new: using data
# NOTE(review): the original comment says the keys are dpto and aniorec, but
# the code merges on "aNo"; confirm that both frames really carry a column
# with this exact name (and that dpto has the same dtype on both sides).

merge_append_deflac = pd.merge(merge_append, deflactores_base2020_new,
                               on=["dpto", "aNo"],
                               how="left",
                               validate="m:1")

# Annual household totals divided by 12, household size (mieperho) and the
# ld and i00 columns (presumably spatial and temporal deflators - confirm).
# Column names must be strings: bare ld / i00 would raise NameError.
merge_append_deflac["ingreso_month"] = merge_append_deflac["inghog1d"] / (
    12 * merge_append_deflac["mieperho"] * merge_append_deflac["ld"] * merge_append_deflac["i00"]
)

merge_append_deflac["gasto_month"] = merge_append_deflac["gashog2d"] / (
    12 * merge_append_deflac["mieperho"] * merge_append_deflac["ld"] * merge_append_deflac["i00"]
)

#%% Question 3 - Group by

# Imports are repeated here so that this cell can be run on its own.

import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import weightedcalcs as wc
from tqdm import tqdm
import re
import os

# Set the working directory

os.chdir("C:/Users/Marcela Quintero/Documents/GitHub/1ECO35_2022_2/Lab7")

# Read the dataset

enaho_2 = pd.read_stata(r"../../../enaho/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta", convert_categoricals=False)

# Inspect the dataset (bare expressions only display in an interactive console)

enaho_2


# Inspect the labels before running the groupby

labels2 = pd.read_stata(r"../../../enaho/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta",
                        convert_categoricals=False, iterator=True)

labels2.variable_labels()

labels2.value_labels().keys()

enaho_2.keys()


# Use groupby to keep the maximum p208a (age, per the original comment)
# within each household

hogares = enaho_2.groupby(['conglome', 'vivienda', 'hogar'], as_index=False)["p208a"].max()

print(hogares)


# Merge with module 34 to obtain the missing data (poverty status).
# First load module 34 and its labels.

enaho34 = pd.read_stata(r"../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
                        convert_categoricals=False)

labels34 = pd.read_stata(r"../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
                         convert_categoricals=False, iterator=True)

labels34.variable_labels()

labels34.value_labels().keys()

enaho34.keys()


# Reduce module 34 to the household key plus the poverty status

hogares34 = enaho34.groupby(['conglome', 'vivienda', 'hogar'], as_index=False)["pobreza"].sum()

print(hogares34)


# Merge hogares with hogares34

enaho_merge = pd.merge(hogares, hogares34,
                       on=["conglome", "vivienda", "hogar"],
                       how="inner",
                       validate="1:1")

print(enaho_merge)


# Dummy that is True when the household's maximum p208a is at least 65 and
# pobreza is below 3 (per the original comment: a poor household with a
# member aged 65 or older).

enaho_merge['dummy_pension'] = (enaho_merge['p208a'] >= 65) & (enaho_merge['pobreza'] < 3)


# Show the dummy: True if both conditions hold, False otherwise

print(enaho_merge['dummy_pension'])