GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Lab7/script_merge_py.py
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 24

@author: Roberto

@script: Clean ENAHO
"""

#!pip install weightedcalcs

import os                    # for username and working directory
import pandas as pd
import numpy as np
import weightedcalcs as wc   # weights / expansion factors
from tqdm import tqdm        # progress bar to time loops


user = os.getlogin()  # username

os.chdir(f"C:/Users/{user}/Documents/GitHub/1ECO35_2022_2/Lab7")  # set working directory

#%% Merge

# Read Stata datasets using pandas

# Note that, by default, pandas converts coded values into their value labels,
# so you can see each variable's labels directly.

enaho_2020 = pd.read_stata(r"../../../datos/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta")

# Set convert_categoricals=False (it defaults to True) to keep the raw codes
# instead of converting them into labeled categoricals.

enaho01 = pd.read_stata(r"../../../datos/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta",
                        convert_categoricals=False)

labels01 = pd.read_stata(r"../../../datos/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta",
                         convert_categoricals=False, iterator=True)

# iterator=True returns a StataReader, which exposes the label metadata

labels01.variable_labels()

labels01.value_labels().keys()

labels01.value_labels()['p110']
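
# variable_labels() returns a dict {variable: label}; a minimal sketch
# (a browsing helper, not part of the original workflow) to inspect it
# like any pandas object:
var_labels = pd.Series(labels01.variable_labels())
var_labels.head()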

# Person-level identifier: conglome, vivienda, hogar, codperso

# We take module 02 (characteristics of household members) as the master data

enaho02 = pd.read_stata(r"../../../datos/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta",
                        convert_categoricals=False)

# We merge it with the remaining modules (the using data)

# Contains the coordinate information

enaho03 = pd.read_stata(r"../../../datos/2020/737-Modulo03/737-Modulo03/enaho01a-2020-300.dta",
                        convert_categoricals=False)

enaho04 = pd.read_stata(r"../../../datos/2020/737-Modulo04/737-Modulo04/enaho01a-2020-400.dta",
                        convert_categoricals=False)

enaho05 = pd.read_stata(r"../../../datos/2020/737-Modulo05/737-Modulo05/enaho01a-2020-500.dta",
                        convert_categoricals=False)

enaho34 = pd.read_stata(r"../../../datos/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta",
                        convert_categoricals=False)

enaho37 = pd.read_stata(r"../../../datos/2020/737-Modulo37/737-Modulo37/enaho01-2020-700.dta",
                        convert_categoricals=False)

# labels

labels37 = pd.read_stata(r"../../../datos/2020/737-Modulo37/737-Modulo37/enaho01-2020-700.dta",
                         convert_categoricals=False, iterator=True)

labels37.variable_labels()

# Household-level identifier: conglome, vivienda, hogar

enaho_merge = pd.merge(enaho02, enaho01,
                       on=["conglome", "vivienda", "hogar"],
                       how="left",
                       validate="m:1")

enaho_merge['latitud'].isna().sum()

# enaho02: dataset with information on household members
# enaho01: household-level dataset
# on: key variables that identify the observations shared by both datasets
# how: type of merge to perform
# validate: relationship to enforce between the two datasets
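
# A toy example (illustrative data, not ENAHO) of `on`, `how` and `validate`:
# validate="m:1" asserts that the right-hand keys are unique, and how="left"
# keeps every row of the left dataset.
people = pd.DataFrame({"hh": [1, 1, 2], "name": ["Ana", "Luis", "Eva"]})
homes = pd.DataFrame({"hh": [1, 2], "region": ["Lima", "Cusco"]})
pd.merge(people, homes, on="hh", how="left", validate="m:1")  # 3 rows, region attached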

#cols_to_use = enaho02.columns.difference(enaho01.columns)

enaho_merge = pd.merge(enaho02, enaho01,
                       on=["conglome", "vivienda", "hogar"],
                       how="left",
                       validate="m:1", suffixes=('', '_y'))

# suffixes: how to rename the variables the two datasets have in common

# Merge that keeps only selected variables from the using data

enaho_merge_2 = pd.merge(enaho02, enaho01[["conglome", "vivienda", "hogar", 'longitud', 'latitud']],
                         on=["conglome", "vivienda", "hogar"],
                         how="left",
                         validate="m:1")

#%% Merge types

# Basic Stata output (_merge == 1, 3): keep master and matched observations

enaho_merge_left = pd.merge(enaho01, enaho37,
                            on=["conglome", "vivienda", "hogar"],
                            how="left",
                            validate="1:1")

# Basic Stata output (_merge == 2, 3): keep using and matched observations

enaho_merge_right = pd.merge(enaho01, enaho37,
                             on=["conglome", "vivienda", "hogar"],
                             how="right",
                             validate="1:1")

# Basic Stata output (_merge == 3): keep matched observations only

enaho_merge_inner = pd.merge(enaho01, enaho37,
                             on=["conglome", "vivienda", "hogar"],
                             how="inner",
                             validate="1:1")

enaho_merge_outer = pd.merge(enaho01, enaho37,
                             on=["conglome", "vivienda", "hogar"],
                             how="outer",
                             validate="1:1")

datos = np.array([enaho_merge_left.shape[0],
                  enaho_merge_right.shape[0],
                  enaho_merge_inner.shape[0],
                  enaho_merge_outer.shape[0],
                  ])

pd.DataFrame(index=['Left', 'Right', 'Inner', 'Outer'],
             data=datos, columns=["observaciones"])
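
# Toy demonstration (illustrative data) of how the four join types change
# the row count when the key sets only partially overlap:
a = pd.DataFrame({"k": [1, 2, 3], "x": [10, 20, 30]})
b = pd.DataFrame({"k": [2, 3, 4], "y": [200, 300, 400]})
{how: pd.merge(a, b, on="k", how=how).shape[0]
 for how in ["left", "right", "inner", "outer"]}  # {'left': 3, 'right': 3, 'inner': 2, 'outer': 4}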

# Basic Stata output (_merge == 1, 2, 3)

enaho_merge_3 = pd.merge(enaho01, enaho37,
                         on=["conglome", "vivienda", "hogar"],
                         how="outer",
                         validate="1:1")

enaho_merge_3.shape

# merge using individual-level datasets

#%% Merge using different key variables

enaho03.rename(columns={"conglome": "cong", "vivienda": "viv", "hogar": "hog", "codperso": "perso"},
               inplace=True)

merge_1 = pd.merge(enaho02, enaho03,
                   left_on=["conglome", "vivienda", "hogar", "codperso"],
                   right_on=["cong", "viv", "hog", "perso"],
                   how="left",
                   validate="1:1")

# rename the keys back to their original names
enaho03.rename(columns={"cong": "conglome", "viv": "vivienda", "hog": "hogar", "perso": "codperso"},
               inplace=True)

## Merge with the household-level datasets: enaho01, enaho34, enaho37

num = ["34", "37"]

merge_hog = enaho01

merge_hog['ubigeo_dep2'] = merge_hog['ubigeo'].str[:2]           # 2-digit department code
merge_hog['ubigeo_dep6'] = merge_hog['ubigeo'].str[:2] + "0000"  # department code padded to 6 digits

merge_hog = merge_hog[merge_hog.ubigeo_dep2.isin(["15", "03", "04"])]

for i in tqdm(num):
    merge_hog = pd.merge(merge_hog, globals()[f'enaho{i}'],
                         on=["conglome", "vivienda", "hogar"],
                         how="left",
                         suffixes=('', '_y'),
                         validate="1:1")


# Merge at the household-member level

num = ["03"]

merge_ind = enaho02  # individuals module

merge_ind['ubigeo_pr'] = merge_ind['ubigeo'].str[:2]

merge_ind = merge_ind[merge_ind.ubigeo_pr.isin(["15", "03", "04"])]

# globals() lets Python look up each dataset (enaho03, ...) by name inside the loop

for i in tqdm(num):
    merge_id = pd.merge(merge_ind, globals()[f'enaho{i}'],
                        on=["conglome", "vivienda", "hogar", "codperso"],
                        how="left",
                        suffixes=('', '_y'),
                        validate="1:1")
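
# A dict of DataFrames avoids globals(); a minimal sketch that mirrors the
# loop above (the dict name `modules` is illustrative, not from the script):
modules = {"03": enaho03}
for i in tqdm(num):
    merge_id = pd.merge(merge_ind, modules[i],
                        on=["conglome", "vivienda", "hogar", "codperso"],
                        how="left",
                        suffixes=('', '_y'),
                        validate="1:1")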

# Merge households and individuals

merge_base_2020 = pd.merge(merge_id, merge_hog,
                           on=["conglome", "vivienda", "hogar"],
                           how="left",
                           validate="m:1",
                           suffixes=('', '_y'),
                           )

# drop the variables that end in _y

index_columns = np.where(merge_base_2020.columns.str.contains('_y$', regex=True))[0]

merge_base_2020.drop(merge_base_2020.columns[index_columns], axis=1, inplace=True)

merge_base_2020['linea']  # poverty line (from the sumaria module)
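
# Small illustration (toy column names) of the '_y$' pattern used above:
pd.Index(["edad", "edad_y", "ubigeo_y"]).str.contains('_y$', regex=True)  # [False, True, True]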

###########################################
############# Merge 2019 ##################
###########################################

enaho01 = pd.read_stata(r"../../../datos/2019/687-Modulo01/687-Modulo01/enaho01-2019-100.dta",
                        convert_categoricals=False)

enaho02 = pd.read_stata(r"../../../datos/2019/687-Modulo02/687-Modulo02/enaho01-2019-200.dta",
                        convert_categoricals=False)

enaho03 = pd.read_stata(r"../../../datos/2019/687-Modulo03/687-Modulo03/enaho01a-2019-300.dta",
                        convert_categoricals=False)

enaho04 = pd.read_stata(r"../../../datos/2019/687-Modulo04/687-Modulo04/enaho01a-2019-400.dta",
                        convert_categoricals=False)

enaho05 = pd.read_stata(r"../../../datos/2019/687-Modulo05/687-Modulo05/enaho01a-2019-500.dta",
                        convert_categoricals=False)

enaho34 = pd.read_stata(r"../../../datos/2019/687-Modulo34/687-Modulo34/sumaria-2019.dta",
                        convert_categoricals=False)

enaho37 = pd.read_stata(r"../../../datos/2019/687-Modulo37/687-Modulo37/enaho01-2019-700.dta",
                        convert_categoricals=False)

# keep only the variables of interest

enaho02 = enaho02[["conglome", "vivienda", "hogar", "codperso",
                   "ubigeo", "dominio", "estrato", "p208a", "p209",
                   "p207", "p203", "p201p", "p204", "facpob07"]]

enaho03 = enaho03[["conglome", "vivienda", "hogar", "codperso",
                   "p301a", "p301b", "p301c", "p300a"]]

enaho05 = enaho05[["conglome", "vivienda", "hogar", "codperso",
                   "i524e1", "i538e1", "p558a5", "i513t", "i518",
                   "p507", "p511a", "p512b", "p513a1", "p505", "p506", "d544t", "d556t1",
                   "d556t2", "d557t", "d558t", "ocu500", "i530a", "i541a"]]

num = ["34", "37"]

merge_hog = enaho01

merge_hog['ubigeo_pr'] = merge_hog['ubigeo'].str[:2]
# padded version stored under its own name: the original overwrote ubigeo_pr,
# which left the 2-digit filter below with nothing to match
merge_hog['ubigeo_pr6'] = merge_hog['ubigeo'].str[:2] + "0000"

merge_hog = merge_hog[merge_hog.ubigeo_pr.isin(["15", "03", "04", "12"])]

for i in tqdm(num):
    merge_hog = pd.merge(merge_hog, globals()[f'enaho{i}'],
                         on=["conglome", "vivienda", "hogar"],
                         how="left",
                         suffixes=('', '_y'),
                         validate="1:1")

num = ["03"]
merge_ind = enaho02

merge_ind['ubigeo_pr'] = merge_ind['ubigeo'].str[:2]

merge_ind = merge_ind[merge_ind.ubigeo_pr.isin(["15", "03", "04", "12"])]

for i in tqdm(num):
    merge_ind = pd.merge(merge_ind, globals()[f'enaho{i}'],
                         on=["conglome", "vivienda", "hogar", "codperso"],
                         how="left",
                         suffixes=('', '_y'),
                         validate="1:1")

# Merge households and individuals

merge_base_2019 = merge_ind.merge(merge_hog,
                                  on=["conglome", "vivienda", "hogar"],
                                  how="left",
                                  validate="m:1",
                                  suffixes=('', '_y'),
                                  )

## drop the variables that end in _y ##

index_columns = np.where(merge_base_2019.columns.str.contains('_y$', regex=True))[0]

merge_base_2019.drop(merge_base_2019.columns[index_columns], axis=1, inplace=True)
#%% Append

# DataFrame.append was deprecated and removed in pandas 2.0; pd.concat is the
# current equivalent
merge_append = pd.concat([merge_base_2020, merge_base_2019], ignore_index=True)

# ignore_index=True: rebuild the row index so there are no indexing conflicts

merge_append.to_stata("../../../append_enaho.dta", write_index=False)

# write_index=False: do not save the index as an extra column
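
# Toy illustration of ignore_index=True: the row index is rebuilt as 0..n-1
pd.concat([pd.DataFrame({"x": [1]}), pd.DataFrame({"x": [2]})], ignore_index=True)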
#%% Poverty measures

# 1) Monthly per-capita income
# 2) Monthly per-capita expenditure

# inghog1d: annual household income
# gashog2d: annual household expenditure
# mieperho: number of household members
# ingreso_month: monthly per-capita income
# gasto_month: monthly per-capita expenditure

merge_base_2020["ingreso_month"] = merge_base_2020["inghog1d"]/(12*merge_base_2020["mieperho"])

merge_base_2020["gasto_month"] = merge_base_2020["gashog2d"]/(12*merge_base_2020["mieperho"])


# Generate the variable "pobre" by comparing expenditure with the poverty line

merge_base_2020["pobre"] = np.where(
    merge_base_2020["gasto_month"] < merge_base_2020["linea"],
    "pobre", "no pobre")

# Here neither gasto_month nor linea has missings; if either did, the
# indicator should be missing as well

merge_base_2020["pobre"] = np.where(
    merge_base_2020[['gasto_month', 'linea']].isnull().any(axis=1), np.nan,  # if at least one is missing, set NaN
    np.where(  # otherwise apply the condition of interest
        merge_base_2020["gasto_month"] < merge_base_2020["linea"],
        "pobre",
        "no pobre")
    )

# value_counts: absolute frequency of each value of the variable

merge_base_2020["pobre"].value_counts()

merge_base_2020["pobre"].unique()

# np.where(condition, value if True, value if False)

merge_base_2020["dummy_pobre"] = np.where(
    merge_base_2020["gasto_month"] < merge_base_2020["linea"],
    1, 0)
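
# Toy example (illustrative values only) of np.where's elementwise choice:
np.where(np.array([1, 5, 3]) < 4, 1, 0)  # -> array([1, 0, 1])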

# now taking possible missings into account

merge_base_2020["dummy_pobre"] = np.where(
    merge_base_2020[['gasto_month', 'linea']].isnull().any(axis=1), np.nan,  # if at least one is missing, set NaN
    np.where(  # otherwise apply the condition of interest
        merge_base_2020["gasto_month"] < merge_base_2020["linea"],
        1,
        0)
    )


# Generate the variable "pc_pobre" by recoding the variable "pobreza"

merge_base_2020["pc_pobre"] = merge_base_2020["pobreza"].replace({1: "Pobre extremo",
                                                                  2: "Pobre",
                                                                  3: "No pobre"})

merge_base_2020["pobreza"]
merge_base_2020["pc_pobre"]

merge_base_2020["pc_pobre"].unique()

# pobreza (1 extreme poor, 2 non-extreme poor, 3 not poor)

###############################################
################ Dummies ######################
###############################################

merge_base_2020["p301a"].unique()
merge_base_2020["p301a"].value_counts()

merge_base_2020["p301a"].replace({np.nan: 99}, inplace=True)

# replace the missing-value code 99 back with NaN

merge_base_2020["p301a"].replace({99: np.nan}, inplace=True)

merge_base_2020["p301a"].unique()
merge_base_2020["p301a"].value_counts()

# replace missing values (same recode, repeated)

merge_base_2020["p301a"].replace({99: np.nan}, inplace=True)

# generate dummies

pd.get_dummies(merge_base_2020["p301a"])

# attach the dummy variables to the dataframe

levels = len(merge_base_2020["p301a"].unique()) - 1  # do not count the NA category

merge_base_2020[[f"var_{i+1}" for i in range(levels)]] = pd.get_dummies(merge_base_2020["p301a"])

# alternative

merge_base_2020 = pd.concat([merge_base_2020, pd.get_dummies(merge_base_2020["p301a"])], axis=1)

# axis=1: concatenate the datasets horizontally
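
# pd.get_dummies can also name the columns directly; a small sketch (the
# prefix "educ" is an illustrative choice, not from the original script):
pd.get_dummies(merge_base_2020["p301a"], prefix="educ").head()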

merge_base_2020.columns

merge_base_2020[10]  # dummy column labeled with the p301a code 10

#%% Collapse - groupby

# highest education level attained, dummy for higher education, lowest level attained

df1 = merge_base_2020.groupby(["conglome", "vivienda", "hogar"]).agg(
        edu_max=('p301a', np.max),
        edu_min=('p301a', np.min),
        total_miembros1=('conglome', np.size),  # counts every row, missings included
        total_miembros2=('conglome', 'count'),  # counts non-missing values only
        sup_educ=('var_10', np.sum))

# to ignore missings, use np.nanmax, np.nanmin, np.nansum

df1 = merge_base_2020.groupby(["conglome", "vivienda", "hogar"]).agg(
        edu_max=('p301a', np.nanmax),
        edu_min=('p301a', np.nanmin),
        total_miembros2=('conglome', np.size),  # counts every row
        sup_educ=('var_10', np.nansum))

# as_index=True (the default) turns the grouping variables into the index

df1['vivienda']  # does not exist as a column (raises KeyError)

df1['hogar']     # does not exist as a column either

# var_10: completed university education

# Example: three members of one household
#   conglome  vivienda  hogar  var_10
#   1023      12        01     1
#   1023      12        01     0
#   1023      12        01     1
# -> sup_educ = 2 for this household

# as_index=False keeps "conglome", "vivienda", "hogar" as regular columns

df1 = merge_base_2020.groupby(["conglome", "vivienda", "hogar"],
                              as_index=False).agg(
        edu_max=('p301a', np.nanmax),
        edu_min=('p301a', np.nanmin),
        total_miembros=('conglome', np.size),  # counts rows (missings included)
        sup_educ=('var_10', np.nansum))

df1['vivienda']  # now vivienda is part of the dataframe

df2 = merge_base_2020.groupby(["ubigeo_dep2"],
                              as_index=False).agg(index_poverty=('dummy_pobre', np.nanmean))

# dummy
#  obs 1: 0
#  obs 2: 1
#  obs 3: 0
#  obs 4: 1
#
# (1+0+1+0)/4 = 2/4 = 0.5: the mean of a dummy is a share (a percentage)!
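
# Quick numeric check (illustrative):
np.mean([0, 1, 0, 1])  # 0.5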

merge_base_2020["dpto"] = merge_base_2020["ubigeo_dep2"].replace({
    "15": "Lima", "03": "Apurimac", "04": "Arequipa"
    })

df3 = merge_base_2020.groupby(["dpto"],
                              as_index=False).agg(index_poverty=('dummy_pobre', np.nanmean))


# Comparison table using value_counts, similar to `tab` in Stata

print("*-----------------------------------*")
print("Comparison of the poverty variables")
print("*-----------------------------------*")
print("Pobreza")
print(merge_base_2020["pobre"].value_counts())
print("*-----------------------------------*")
print("pc pobre")
print(merge_base_2020["pc_pobre"].value_counts())
print("*-----------------------------------*")
print("Dummy pobre")
print(merge_base_2020["dummy_pobre"].value_counts())
print("*-----------------------------------*")


# Cross tab, like `tab var1 var2` in Stata

pd.crosstab(merge_base_2020["dpto"], merge_base_2020["dummy_pobre"])

# Unweighted tables

pd.crosstab([merge_base_2020["estrsocial"]],
            merge_base_2020["pc_pobre"], margins=True)

# Poverty rate using the expansion factor / weight facpob07

calc = wc.Calculator("facpob07")

apurimac = merge_base_2020[merge_base_2020["dpto"] == "Apurimac"]

# Now the poverty rate by category (extreme poor, poor, not poor) takes the
# expansion factor into account

calc.distribution(apurimac, "pc_pobre").round(3).sort_values(ascending=False)
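
# Minimal toy example (illustrative data) of a weighted distribution:
toy = pd.DataFrame({"grp": ["a", "a", "b"], "w": [1, 1, 2]})
wc.Calculator("w").distribution(toy, "grp")  # a: 0.5, b: 0.5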

# References:
# https://pandas.pydata.org/docs/reference/api/pandas.read_stata.html