Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
robertopucp
GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Trabajo_grupal/WG6/Grupo_4_py.py
2714 views
1
# -*- coding: utf-8 -*-
"""
###############################################################################
#                                                                             #
#                              WG#6 - Grupo 4                                 #
#                                                                             #
###############################################################################

Integrantes:

Luana Morales
Seidy Ascensios
Marcela Quintero
Flavia Oré

"""

#%% Pregunta 1 y 2 (Merge dataset Y Salario por hora del trabajador dependiente)
19
20
21
!pip install weightedcalcs
22
import os # for usernanme y set direcotrio
23
import pandas as pd
24
import numpy as np
25
import weightedcalcs as wc # ponderador
26
from tqdm import tqdm # controlar el tiempo en un loop
27
28
"1) Set Directorio"
29
30
user = os.getlogin() # Username
31
32
33
os.chdir(f"C:/Users/{user}/Documents/GitHub/1ECO35_2022_2/Trabajo_grupal/WG6") # Set directorio
34
35
36
37
"ENAHO 2020"
38
39
"2) Load dataset de ENAHO"
40
41
42
43
enaho_2020 = pd.read_stata(r"../../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta")
44
45
46
enaho01 = pd.read_stata(r"../../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta",
47
convert_categoricals=False)
48
49
50
labels01 = pd.read_stata(r"../../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta",
51
convert_categoricals=False, iterator=True)
52
53
54
55
56
enaho34 = pd.read_stata(r"../../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
57
convert_categoricals=False)
58
59
60
labels34 = pd.read_stata(r"../../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
61
convert_categoricals=False, iterator=True)
62
63
64
65
66
"4) Merge section"
67
68
"Left merge"
69
70
#enaho34: master data
71
#enaho01: using data
72
73
enaho_merge_2020 = pd.merge(enaho34, enaho01,
74
on = ["conglome", "vivienda", "hogar"],
75
how = "left",
76
validate = "m:1")
77
suffixes=('', '_y'),
78
)
79
80
index_columns = np.where( merge_base_2020.columns.str.contains('_y$', regex=True))[0]
81
82
merge_base_2020.drop(merge_base_2020.columns[index_columns], axis = 1, inplace = True)
83
84
85
"ENAHO 2019"
86
87
enaho_2019 = pd.read_stata(r"../../../../enaho/2019/737-Modulo01/737-Modulo01/enaho01-2019-100.dta")
88
89
90
enaho01_1 = pd.read_stata(r"../../../../enaho/2019/737-Modulo01/737-Modulo01/enaho01-2019-100.dta",
91
convert_categoricals=False)
92
93
94
labels01_1 = pd.read_stata(r"../../../../enaho/2019/737-Modulo01/737-Modulo01/enaho01-2019-100.dta",
95
convert_categoricals=False, iterator=True)
96
97
98
99
100
enaho34_1 = pd.read_stata(r"../../../../enaho/2019/737-Modulo34/737-Modulo34/sumaria-2019.dta",
101
convert_categoricals=False)
102
103
104
labels34_1 = pd.read_stata(r"../../../../enaho/2019/737-Modulo34/737-Modulo34/sumaria-2019.dta",
105
convert_categoricals=False, iterator=True)
106
107
"4) Merge section"
108
109
"Left merge"
110
111
#enaho34_1: master data
112
#enaho01_1: using data
113
114
enaho_merge_2019 = pd.merge(enaho34_1, enaho01_1,
115
on = ["conglome", "vivienda", "hogar"],
116
how = "left",
117
validate = "m:1")
118
suffixes=('', '_y'),
119
)
120
121
index_columns = np.where( merge_base_2019.columns.str.contains('_y$', regex=True))[0]
122
123
merge_base_2019.drop(merge_base_2019.columns[index_columns], axis = 1, inplace = True)
124
125
126
# Append
127
128
129
merge_append = merge_base_2020.append(merge_base_2019, ignore_index = True)
130
131
merge_append['dpto'] = merge_append['ubigeo'].str[:2]
132
133
#ignore_index= True : no haya conflictos de indexing
134
135
merge_append.to_stata("append_enaho.dta", write_index = False)
136
137
# Deflactar
138
139
deflactores_base2020_new = pd.read_stata(r"../../../../enaho/2020/737-Modulo34/737-Modulo34/ConstVarGasto-Metodologia actualizada/Gasto2020/Bases/deflactores_base2020_new.dta",
140
convert_categoricals=False)
141
142
143
"4) Merge section deflactores"
144
145
146
# merge usando como llaves a las variables dpto y aniorec.
147
#merge_append: master data
148
#deflactores_base2020_new: using data
149
150
merge_append_deflac = pd.merge(merge_append, deflactores_base2020_new,
151
on = ["dpto", "aNo"],
152
how = "left",
153
validate = "m:1")
154
155
156
merge_append_deflac["ingreso_month"] = merge_append_deflac["inghog1d"]/(12*merge_append_deflac["mieperho"]*merge_append_deflac[ld]*merge_append_deflac[i00])
157
158
merge_append_deflac["gasto_month"] = merge_append_deflac["gashog2d"]/(12*merge_append_deflac["mieperho"]*merge_append_deflac[ld]*merge_append_deflac[i00])
159
160
#%% Pregunta 3 - Group by

# Imports repeated here so this cell can run on its own.
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import weightedcalcs as wc
from tqdm import tqdm
import re
import os

# Set the working directory.
# FIX: the original hard-coded one group member's username
# ("Marcela Quintero") in an f-string with no placeholder; use os.getlogin()
# as in the previous section so the script runs on any member's machine.
user = os.getlogin()
os.chdir(f"C:/Users/{user}/Documents/GitHub/1ECO35_2022_2/Lab7")

# Load module 200 (household-member characteristics), raw codes only.
enaho_2 = pd.read_stata(
    r"../../../enaho/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta",
    convert_categoricals=False,
)

# Inspect the data.
enaho_2

# Inspect the labels before doing the groupby.
labels2 = pd.read_stata(
    r"../../../enaho/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta",
    convert_categoricals=False, iterator=True,
)

labels2.variable_labels()

labels2.value_labels().keys()

enaho_2.keys()

# Group by the household keys and keep the oldest member's age (p208a)
# in each household.
hogares = enaho_2.groupby(['conglome', 'vivienda', 'hogar'], as_index=False).p208a.max()

print(hogares)

# Merge with module 34 (sumaria) to obtain the poverty status.
# First load the module-34 data and its labels.
enaho34 = pd.read_stata(
    r"../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
    convert_categoricals=False,
)

labels34 = pd.read_stata(
    r"../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
    convert_categoricals=False, iterator=True,
)

labels34.variable_labels()

labels34.value_labels().keys()

enaho34.keys()

# Collapse sumaria to one poverty value per household.
# NOTE(review): sumaria appears to already have one row per household (the
# 1:1 validate on the merge below relies on that), so .sum() just passes the
# code through; if duplicates ever existed, summing poverty *codes* would be
# meaningless — .max() or .first() would be safer. Confirm.
hogares34 = enaho34.groupby(['conglome', 'vivienda', 'hogar'], as_index=False).pobreza.sum()

print (hogares34)

# Inner 1:1 merge between the max-age table and the poverty table.
enaho_merge = pd.merge(
    hogares, hogares34,
    on=["conglome", "vivienda", "hogar"],
    how="inner",
    validate="1:1",
)

print (enaho_merge)

# Dummy: True when the household is poor AND has a member aged 65 or older.
# Poverty codes < 3 are the poor categories (presumably 1 = extreme poor,
# 2 = poor, 3 = non-poor — verify against the value labels above).
enaho_merge['dummy_pension'] = (enaho_merge['p208a'] >= 65) & (enaho_merge['pobreza'] < 3)

# True if both conditions hold, False otherwise.
print(enaho_merge['dummy_pension'])
248
249
250
251
252
253
254
255
256
257
258