Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
robertopucp
GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Trabajo_grupal/WG6/Grupo_8_py.ipynb
2714 views
Kernel: Python 3 (ipykernel)

1) Merge Dataset

# Library imports for the merge section.
import os                      # for username and working-directory setup
import pandas as pd
import numpy as np
import weightedcalcs as wc     # weighted statistics (survey expansion factors)
from tqdm import tqdm          # progress bar for loops
# Working directory: the repo's "tareas" folder under the current user's home.
user = os.getlogin()  # Windows login name
os.chdir(f"C:/Users/{user}/Documents/GitHub/Scripts_SabinaOlivera_r_py_jl/tareas")
############# Merge 2020 ##################
# Module 1 (dwelling and household characteristics).
# convert_categoricals=False keeps the raw numeric codes instead of replacing
# them with their Stata value labels (the default is True).
enaho01_2020 = pd.read_stata(
    r"../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta",
    convert_categoricals=False,
)

# Open the same file as a StataReader (iterator=True) just to inspect the
# variable labels and the value-label dictionary keys.
labels01 = pd.read_stata(
    r"../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta",
    convert_categoricals=False,
    iterator=True,
)
labels01.variable_labels()
labels01.value_labels().keys()

# Module 34 (household summary variables).
enaho34_2020 = pd.read_stata(
    r"../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
    convert_categoricals=False,
)

# Merge modules 1 and 34 for 2020.
# Household identifier: conglome, vivienda, hogar (unique on both sides, so
# validate="1:1" holds).
merge2020 = pd.merge(
    enaho01_2020,
    enaho34_2020,
    on=["conglome", "vivienda", "hogar"],
    how="left",
    validate="1:1",
)
dict_keys(['dominio', 'estrato', 'tipenc', 'result', 'panel', 'p22', 'p24a', 'p24b', 'p25_1', 'p25_2', 'p25_3', 'p25_4', 'p25_5', 'p101', 'p102', 'p103', 'p103a', 'p104b1', 'p104b2', 'p105a', 'p106a', 'p106b', 'p107b1', 'p107c11', 'p107c12', 'p107c13', 'p107c14', 'p107c16', 'p107c17', 'p107c18', 'p107c19', 'p107c110', 'p107b2', 'p107c21', 'p107c22', 'p107c23', 'p107c24', 'p107c26', 'p107c27', 'p107c28', 'p107c29', 'p107c210', 'p107b3', 'p107c31', 'p107c32', 'p107c33', 'p107c34', 'p107c36', 'p107c37', 'p107c38', 'p107c39', 'p107c310', 'p107b4', 'p107c41', 'p107c42', 'p107c43', 'p107c44', 'p107c46', 'p107c47', 'p107c48', 'p107c49', 'p107c410', 'p107e', 'p110', 'p110a1', 'p110c', 'p110d', 'p110e', 'p110f', 'p110g', 'p111a', 'p1121', 'p1123', 'p1124', 'p1125', 'p1126', 'p1127', 'p112a', 'p1131', 'p1132', 'p1133', 'p1135', 'p1136', 'p1139', 'p1137', 'p1138', 'p113a', 'p1141', 'p1142', 'p1143', 'p1144', 'p1145', 'p1171_01', 'p1171_02', 'p1171_03', 'p1171_04', 'p1171_05', 'p1171_06', 'p1171_07', 'p1171_08', 'p1171_09', 'p1171_10', 'p1171_11', 'p1171_12', 'p1171_13', 'p1171_14', 'p1171_15', 'p1171_16', 'p1175_01', 'p1175_02', 'p1175_03', 'p1175_04', 'p1175_05', 'p1175_06', 'p1175_07', 'p1175_08', 'p1175_09', 'p1175_10', 'p1175_11', 'p1175_12', 'p1175_13', 'p1175_14', 'p1175_15', 'p1175_16', 'p612i1', 'p612i2', 't110', 't111a', 'nbi1', 'nbi2', 'nbi3', 'nbi4', 'nbi5', 'ticuest01', 'tipocuestionario', 'tipoentrevista', 'rechazo_razones'])
############# Merge 2019 ##################
# Same pattern as 2020: module 1 + module 34, keyed on the household id.
enaho01_2019 = pd.read_stata(
    r"../../../enaho/2019/687-Modulo01/687-Modulo01/enaho01-2019-100.dta",
    convert_categoricals=False,
)
enaho34_2019 = pd.read_stata(
    r"../../../enaho/2019/687-Modulo34/687-Modulo34/sumaria-2019.dta",
    convert_categoricals=False,
)
merge2019 = pd.merge(
    enaho01_2019,
    enaho34_2019,
    on=["conglome", "vivienda", "hogar"],
    how="left",
    validate="1:1",
)
########### Append 2019 and 2020 ############
# Fix: DataFrame.append is deprecated (and removed in pandas 2.0) -- the run
# output showed the FutureWarning. pd.concat is the supported replacement and
# produces the same result. ignore_index=True rebuilds a clean 0..n-1 index so
# the two years' original row labels do not collide.
merge_append = pd.concat([merge2020, merge2019], ignore_index=True)

# write_index=False so the index is not saved as an extra column.
# NOTE(review): pandas renames the non-ASCII columns "aÑo_x"/"aÑo_y" to
# Stata-safe names on write (see the InvalidColumnName warning); the in-memory
# frame keeps the original names.
merge_append.to_stata("../../../append_enaho.dta", write_index=False)
C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\3416170526.py:3: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. merge_append = merge2020.append(merge2019, ignore_index = True) #ignore_index= True para que no haya conflictos de indexing C:\Users\SABINA\anaconda3\lib\site-packages\pandas\io\stata.py:2491: InvalidColumnName: Not all pandas column names were valid Stata variable names. The following replacements have been made: aÑo_x -> a_o_x aÑo_y -> a_o_y If this is not what you expect, please make sure you have Stata-compliant column names in your DataFrame (strings only, max 32 characters, only alphanumerics and underscores, no Stata reserved words) warnings.warn(ws, InvalidColumnName)
###### Monthly income and spending ######
# inghog1d / gashog2d are annual household totals; divide by 12 months and by
# household size (mieperho) to get monthly per-member amounts.
monthly_divisor = 12 * merge_append["mieperho"]
merge_append["ingreso_mensual"] = merge_append["inghog1d"] / monthly_divisor
merge_append["gasto_mensual"] = merge_append["gashog2d"] / monthly_divisor
C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\2700305781.py:3: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` merge_append["ingreso_mensual"] = merge_append["inghog1d"]/(12*merge_append["mieperho"]) C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\2700305781.py:5: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` merge_append["gasto_mensual"] = merge_append["gashog2d"]/(12*merge_append["mieperho"])
########## Deflating the variables ###########
# "ld" is the spatial deflator.
merge_append["ingreso_mensual_defl"] = merge_append["ingreso_mensual"] * merge_append["ld"]
# Fix: store the deflated spending under its own "_defl" name. The original
# overwrote "gasto_mensual" in place, inconsistently with the income line
# above, which silently destroyed the un-deflated value.
merge_append["gasto_mensual_defl"] = merge_append["gasto_mensual"] * merge_append["ld"]

# Build "departamento" from the first two digits of the ubigeo so we can later
# merge against the deflator table.
merge_append['departamento'] = merge_append['ubigeo_x'].str[:2]
merge_append['departamento6'] = merge_append['ubigeo_x'].str[:2] + "0000"

# Keep only department codes 15, 03 and 04 (per the assignment's selection).
merge_append = merge_append[merge_append.departamento.isin(["15", "03", "04"])]

# Temporal deflators, base year 2020.
deflactores_base2020_new = pd.read_stata(
    r"../../../enaho/2020/737-Modulo34/737-Modulo34/ConstVarGasto-Metodologia actualizada/Gasto2020/Bases/deflactores_base2020_new.dta",
    convert_categoricals=False,
)
C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\1517560721.py:5: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` merge_append["ingreso_mensual_defl"] = merge_append["ingreso_mensual"] * merge_append["ld"] C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\1517560721.py:12: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` merge_append['departamento'] = merge_append['ubigeo_x'].str[:2] C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\1517560721.py:13: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` merge_append['departamento6'] = merge_append['ubigeo_x'].str[:2]+"0000"
# Merge merge_append with the temporal deflator table.
# Fix 1: the original merge raised "ValueError: You are trying to merge on
# object and float32 columns" -- the left keys are strings while the deflator
# keys are numeric. Normalise the right keys to zero-padded strings so both
# sides of each key pair share a dtype.
# NOTE(review): assumes "aÑo_x" is stored as a 4-digit string like
# "departamento" -- confirm against the loaded data.
deflactores_base2020_new["dpto"] = (
    deflactores_base2020_new["dpto"].astype(int).astype(str).str.zfill(2)
)
deflactores_base2020_new["aniorec"] = (
    deflactores_base2020_new["aniorec"].astype(int).astype(str)
)

# Fix 2: validate must be "m:1", not "1:1" -- many households share a single
# department/year row in the deflator table, so a 1:1 check can never pass.
merge_def_temporal_2020 = pd.merge(
    merge_append,
    deflactores_base2020_new,
    left_on=["departamento", "aÑo_x"],
    right_on=["dpto", "aniorec"],
    how="left",
    validate="m:1",
)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Input In [9], in <cell line: 3>() 1 #Merge entre merge_append y el deflactor temporal: ----> 3 merge_def_temporal_2020 = pd.merge(merge_append, deflactores_base2020_new , 4 left_on = ["departamento", "aÑo_x"], 5 right_on = ["dpto","aniorec"], 6 how = "left", 7 validate = "1:1")
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:107, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate) 90 @Substitution("\nleft : DataFrame or named Series") 91 @Appender(_merge_doc, indents=0) 92 def merge( (...) 105 validate: str | None = None, 106 ) -> DataFrame: --> 107 op = _MergeOperation( 108 left, 109 right, 110 how=how, 111 on=on, 112 left_on=left_on, 113 right_on=right_on, 114 left_index=left_index, 115 right_index=right_index, 116 sort=sort, 117 suffixes=suffixes, 118 copy=copy, 119 indicator=indicator, 120 validate=validate, 121 ) 122 return op.get_result()
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:704, in _MergeOperation.__init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate) 696 ( 697 self.left_join_keys, 698 self.right_join_keys, 699 self.join_names, 700 ) = self._get_merge_keys() 702 # validate the merge keys dtypes. We may need to coerce 703 # to avoid incompatible dtypes --> 704 self._maybe_coerce_merge_keys() 706 # If argument passed to validate, 707 # check if columns specified as unique 708 # are in fact unique. 709 if validate is not None:
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:1257, in _MergeOperation._maybe_coerce_merge_keys(self) 1251 # unless we are merging non-string-like with string-like 1252 elif ( 1253 inferred_left in string_types and inferred_right not in string_types 1254 ) or ( 1255 inferred_right in string_types and inferred_left not in string_types 1256 ): -> 1257 raise ValueError(msg) 1259 # datetimelikes must match exactly 1260 elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
ValueError: You are trying to merge on object and float32 columns. If you wish to proceed you should use pd.concat
# Monthly per-capita income and spending, deflated spatially (ld) and
# temporally (i00).
# inghog1d: annual household income
# gashog2d: annual household spending
# mieperho: number of household members
#
# Fix: the original divided merge_append columns by a merge_def_temporal_2020
# column. After pd.merge the two frames carry different indexes (the merge
# result has a fresh RangeIndex, merge_append keeps the filtered original
# labels), so label-based alignment would silently produce NaNs. A left merge
# with validate="m:1" preserves merge_append's row order and count, so take
# the deflator positionally via .to_numpy().
i00 = merge_def_temporal_2020["i00"].to_numpy()

# Monthly per-capita income.
merge_append["ingreso_mensual_pc"] = merge_append["inghog1d"] / (
    12 * merge_append["mieperho"] * merge_append["ld"] * i00
)

# Monthly per-capita spending.
merge_append["gasto_mensual_pc"] = merge_append["gashog2d"] / (
    12 * merge_append["mieperho"] * merge_append["ld"] * i00
)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [10], in <cell line: 3>() 1 # Ingreso per capita mensual ----> 3 merge_append["ingreso_mensual_pc"] = merge_append["inghog1d"]/(12*merge_append["mieperho"]*merge_append["ld"]*merge_def_temporal_2020["i00"]) 5 # Gasto per capita mensual 7 merge_append["gasto_mensual_pc"] = merge_append["gashog2d"]/(12*merge_append["mieperho"]*merge_append["ld"]*merge_def_temporal_2020["i00"]) NameError: name 'merge_def_temporal_2020' is not defined

2) Salario por hora del trabajador dependiente

#!pip install weightedcalcs
# Setup and data load for the hourly-wage section (module 05: employment).
import os                      # for username and working-directory setup
import pandas as pd
import numpy as np
import weightedcalcs as wc     # weighted statistics (survey expansion factors)
from tqdm import tqdm          # progress bar for loops

user = os.getlogin()  # Windows login name

# Read the Stata dataset with pandas.
os.chdir(f"C:/Users/{user}/Documents/Documentos/Enaho")  # set working directory

# convert_categoricals=False keeps the raw numeric codes instead of the Stata
# value labels (the default True would replace them). This is the master data.
# Fix: the original read this file three times (the first read was immediately
# overwritten); one read is enough.
enaho_2020modulo5 = pd.read_stata(
    r"../../../737-Modulo05/Enaho01A-2020-500.dta",
    convert_categoricals=False,
)

# Fix: the original labels01 line had an extra "(" before the path and an
# unterminated string after it, which caused the SyntaxError in the output.
labels01 = pd.read_stata(
    r"../../../737-Modulo05/Enaho01A-2020-500.dta",
    convert_categoricals=False,
    iterator=True,
)

## Filter ##
# Fix: the original rebound the single name index_columns four times, keeping
# only the last result; give each column position its own name.
index_i524e1 = np.where(enaho_2020modulo5.columns.str.contains('i524e1', regex=True))[0]
index_i538e1 = np.where(enaho_2020modulo5.columns.str.contains('i538e1', regex=True))[0]
index_i513t = np.where(enaho_2020modulo5.columns.str.contains('i513t', regex=True))[0]
index_i518 = np.where(enaho_2020modulo5.columns.str.contains('i518', regex=True))[0]

# Replace zero incomes with missing values so they drop out of the wage math.
enaho_2020modulo5["i524e1"].replace({0: np.nan}, inplace=True)
enaho_2020modulo5["i538e1"].replace({0: np.nan}, inplace=True)
Input In [11] convert_categoricals=False, iterator=True) ^ SyntaxError: invalid syntax
# Build income, total hours and hourly wage.
# Fix: the original used bare names (i524e1, i538e1, i513t, i518, ingreso,
# horasen_principal_y_2do_empleo), which raised NameError -- they are columns
# of enaho_2020modulo5, not Python variables.
# NOTE(review): per the filters above, i524e1/i538e1 are the income variables
# and i513t/i518 the hours variables for the first and second jobs; the *52
# suggests the hours are weekly and the income annual -- confirm against the
# ENAHO module-05 codebook.
enaho_2020modulo5["ingreso"] = (
    enaho_2020modulo5["i524e1"] + enaho_2020modulo5["i538e1"]
)
enaho_2020modulo5["horasen_principal_y_2do_empleo"] = (
    enaho_2020modulo5["i513t"] + enaho_2020modulo5["i518"]
)
enaho_2020modulo5["salario"] = enaho_2020modulo5["ingreso"] / (
    enaho_2020modulo5["horasen_principal_y_2do_empleo"] * 52
)

3) Groupby

!pip install weightedcalcs import os # for usernanme y set direcotrio import pandas as pd import numpy as np import weightedcalcs as wc # ponderador from tqdm import tqdm # controlar el tiempo en un loop #Extraemos la data del modulo 2 user = os.getlogin() # Username os.chdir(f"C:/Users/{user}/Documents/GitHub/1ECO35_2022_2/Lab7") # Set directorio enaho02 = pd.read_stata(r"../../../datos/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta", convert_categoricals=False)
# Collapse to one row per household with the oldest member's age
# (p208a = age in years).
base2 = (
    enaho02
    .groupby(["conglome", "vivienda", "hogar"], as_index=False)
    .agg(edad_max=("p208a", np.max))
)

# Load module 34 (household summary variables).
enaho34 = pd.read_stata(
    r"../../../datos/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
    convert_categoricals=False,
)
#Identificador por hogar: conglome, vivienda, hogar enaho_merge = pd.merge(base2, enaho34, on = ["conglome", "vivienda", "hogar"], how = "left", validate = "m:1", suffixes=('', '_y')) #Creamos la dummy base2["dummy_pension"] = np.where( enaho_merge["edad_max"] <=65 ,1, 0)