Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
robertopucp
GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Lab7/script_merge_r.R
2714 views
1
################ laboratorio 7 ############################
2
## Curso: Laboratorio de R y Python ###########################
3
## @author: Roberto Mendoza
4
## Clean dataset
5
6
#install.packages("srvyr") para declarar encuestas en R (similar al svyset en stata)
7
8
9
#install.packages("fastDummies")
10
11
#Librerias de limpieza de datos
12
13
#pacman::p_load(haven,dplyr, stringr, fastDummies)
14
15
16
library(haven) # leer archivos spss, stata, dbf, etc
17
library(dplyr) # limpieza de datos
18
library(stringr) # grep for regular expression
19
library(fastDummies) # crear dummy
20
library(srvyr) # libreria para declarar el diseño muestral de una encuesta
21
library(survey)
22
23
24
# tydiverse: ggplot , dplyr other libraries
25
26
# Conglome : 1235 , vivienda: 10, hogar: 11 , codperso : 4 año 2019
27
28
# Conglome : 1235 , vivienda: 10, hogar: 11, codperso : 4 año 2015
29
30
31
32
"1.0 Set Directorio"
33
34
user <- Sys.getenv("USERNAME") # username
35
36
setwd( paste0("C:/Users/",user,"/Documents/GitHub/1ECO35_2022_2/Lab7") ) # set directorio
37
38
39
"2.0 Load dataset de ENAHO"
40
41
enaho01 <- read_dta("../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta")
42
43
# tibble dataset
44
45
enaho01$dominio
46
47
enaho01 <- data.frame(
48
49
read_dta("../../../enaho/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta")
50
51
)
52
53
#data.frame dataset
54
55
enaho01
56
enaho01$dominio
57
58
# Check labels
59
60
# %>% Ctrl + shift + m
61
62
enaho01$estrato %>% attr('labels') # value labels
63
64
65
enaho01$factor07 %>% attr('label') # var label
66
67
names(enaho01)
68
69
# Weight sampling
70
71
enaho02 = data.frame(
72
read_dta("../../../enaho/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta")
73
)
74
75
names(enaho02)
76
77
length( unique(enaho02$facpob07) )
78
79
length( unique(enaho02$conglome) )
80
81
82
length( unique(enaho01$factor07) )
83
84
length( unique(enaho01$conglome) )
85
86
sum(enaho02$facpob07)
87
88
unique(enaho01$conglome) # La suma resulta en total de la población 2020 proyectada?
89
90
"Módulo02"
91
92
enaho02 = data.frame(
93
read_dta("../../../enaho/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta")
94
)
95
96
enaho03 = data.frame(
97
read_dta("../../../enaho/2020/737-Modulo03/737-Modulo03/enaho01a-2020-300.dta"))
98
99
enaho04 = data.frame(
100
read_dta("../../../enaho/2020/737-Modulo04/737-Modulo04/enaho01a-2020-400.dta")
101
)
102
103
enaho05 = data.frame(
104
read_dta("../../../enaho/2020/737-Modulo05/737-Modulo05/enaho01a-2020-500.dta")
105
)
106
107
enaho34 = data.frame(
108
read_dta("../../../enaho/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta")
109
)
110
111
enaho37 = data.frame(
112
read_dta("../../../enaho/2020/737-Modulo37/737-Modulo37/enaho01-2020-700.dta")
113
)
114
115
116
117
118
# Seleccionar variables
119
120
121
122
enaho02 <- enaho02[ , c("conglome", "vivienda", "hogar" , "codperso",
123
"ubigeo", "dominio" ,"estrato" ,"p208a", "p209",
124
"p207", "p203", "p201p" , "p204", "facpob07") ]
125
126
127
enaho03 <- enaho03[ , c("conglome", "vivienda", "hogar" , "codperso",
128
"p301a", "p301b", "p301c" , "p300a","p301b","p301c")]
129
130
enaho05 <- enaho05[ , c("conglome", "vivienda", "hogar" , "codperso",
131
"i524e1", "i538e1", "p558a5" , "i513t", "i518",
132
"p507", "p511a", "p512b", "p513a1", "p505" , "p506", "d544t", "d556t1",
133
"d556t2" , "d557t" , "d558t" , "ocu500" , "i530a" , "i541a")]
134
135
# 3.0 Merge section #
136
137
# Merge identifica automaticamente los casos de merge m:1, 1:1, 1:m
138
139
140
141
# _merge3 == 1,3
142
143
"Left merge"
144
145
#enaho02: master data
146
# enaho01: using data
147
148
enaho_merge <- merge(enaho02, enaho01,
149
by = c("conglome", "vivienda", "hogar"),
150
all.x = T
151
)
152
153
enaho_02_05 <- merge(enaho02, enaho05,
154
by = c("conglome", "vivienda", "hogar","codperso"),
155
all.x = T
156
)
157
158
159
# by: variable que permite identificar las observaciones en común en las bases de datos
160
# all.x : La base de datos preservará todas las observaciones de left data (enaho02)
161
# all.x tiene como valor predeterminado a False.
162
163
164
# all.x = False, all.y = False
165
166
# _merge3 == 2,3
167
168
169
enaho_02_05 <- merge(enaho02, enaho05,
170
by = c("conglome", "vivienda", "hogar","codperso"),
171
all.y = TRUE
172
)
173
174
175
# _merge3 == 3 (match inner)
176
177
enaho_merge_inner <- merge(enaho02, enaho01,
178
by = c("conglome", "vivienda", "hogar"),
179
all.x = F, all.y = F
180
)
181
182
183
184
enaho_merge_inner <- merge(enaho02, enaho01,
185
by = c("conglome", "vivienda", "hogar")
186
)
187
188
enaho_merge_02_05 <- merge(enaho02, enaho05,
189
by = c("conglome", "vivienda", "hogar","codperso")
190
)
191
192
193
# Match outer
194
195
196
enaho_merge_outer <- merge(enaho02, enaho05,
197
by = c("conglome", "vivienda", "hogar","codperso"),
198
all.x = T, all.y = T
199
)
200
201
202
enaho_merge_outer_2 <- merge(enaho02, enaho05,
203
by = c("conglome", "vivienda", "hogar","codperso"),
204
all= T
205
)
206
207
208
209
# suffixes
210
211
enaho_merge <- merge(enaho02, enaho01,
212
by = c("conglome", "vivienda", "hogar"),
213
all.x = T
214
)
215
216
217
names(enaho_merge)
218
219
220
221
enaho_merge <- merge(enaho02, enaho01,
222
by = c("conglome", "vivienda", "hogar"),
223
all.x = T, suffixes = c("","")
224
)
225
226
enaho_merge <- merge(enaho02, enaho01,
227
by = c("conglome", "vivienda", "hogar"),
228
all.x = T, suffixes = c("",".y")
229
)
230
231
232
names(enaho_merge)
233
234
#------------------------- Match with different keyword (variable llave) -------------------------
235
236
237
# rename variables que identifican de manera unica a cada hogar
238
239
enaho05 <- enaho05 %>% dplyr::rename(Conglo = conglome, viv = vivienda,
240
hog = hogar, cod = codperso)
241
242
243
244
245
enaho_02_05 <- merge(enaho02, enaho05,
246
by.x = c("conglome", "vivienda", "hogar","codperso"),
247
by.y = c("Conglo", "viv", "hog","cod"),
248
all = TRUE
249
)
250
251
252
253
# reset los nombre correctos de la variables que identificar cada hogar
254
255
enaho05 <- enaho05 %>% dplyr::rename(conglome = Conglo, vivienda = viv,
256
hogar = hog, codperso = cod)
257
258
259
260
#---------------------- Merge in Loop ------------------------------------
261
262
# <- shortcut Alt + -
263
264
num = list(enaho34 , enaho37) # lista de data.frames
265
266
merge_hog = enaho01 # Master Data
267
268
for (i in num){
269
270
merge_hog <- merge(merge_hog, i,
271
by = c("conglome", "vivienda", "hogar"),
272
all.x = T, suffixes = c("",".y")
273
)
274
}
275
276
names(merge_hog)
277
278
279
280
# Individual dataset
281
282
num = list(enaho03 , enaho04, enaho05 ) # lista de data.frames
283
284
merge_ind = enaho02 # Master Data
285
286
for (i in num){
287
288
merge_ind <- merge(merge_ind, i,
289
by = c("conglome", "vivienda", "hogar","codperso"),
290
all.x = T, suffixes = c("",".y")
291
)
292
}
293
294
names(merge_ind)
295
296
297
#----------------------- Merge Indivual and Hohar datasets -----------------------------------
298
299
# merge merge_hog and merge_ind
300
# mwrge_ind : master data
301
302
merge_base <- merge(merge_ind, merge_hog,
303
by = c("conglome", "vivienda", "hogar"),
304
all.x = T, suffixes = c("",".y"))
305
306
307
colnames(merge_base)
308
309
index <- grep(".y$", colnames(merge_base)) # Regular regular
310
311
# $ el texto finaliza con .y
312
313
merge_base_2020 <- merge_base[, - index]
314
315
colnames(merge_base_2020)
316
317
318
### Ubigeo de departamento
319
320
#ubigeo: 12 (ubigeo region junin, 1206 (provincia de satipo)
321
# 120601 (distrito de la provincia de satipo, region junin)
322
323
# sibstr permite sustraer digitos de un string, texto, caracter
324
325
merge_base_2020['ubigeo_dep'] = substr(merge_base_2020$ubigeo, 1, 2)
326
327
# a aprtir de la posición inicial, extraer los dos primeros digitos
328
329
merge_base_2020['ubigeo_dep_2'] = paste(substr(merge_base_2020$ubigeo,1,2),
330
"0000", sep = "")
331
332
333
### filtrado para algunos departamentos
334
335
merge_base_2020 <- merge_base_2020 %>% filter(
336
merge_base_2020$ubigeo_dep %in% c("15","03","04","12") )
337
338
#library(dplyr)
339
340
merge_base_2020 <- merge_base_2020 %>%
341
mutate(region = case_when(ubigeo_dep == "04" ~ "Arequipa",
342
ubigeo_dep == "03" ~ "Apurimac",
343
ubigeo_dep == "12" ~ "Junin",
344
ubigeo_dep == "15" ~ "Lima") )
345
346
347
348
349
"ENAHO 2019"
350
351
enaho01 <- data.frame(
352
read_dta("../../../datos/2019/687-Modulo01/687-Modulo01/enaho01-2019-100.dta")
353
)
354
355
enaho02 = data.frame(
356
read_dta("../../../datos/2019/687-Modulo02/687-Modulo02/enaho01-2019-200.dta")
357
)
358
359
enaho03 = data.frame(
360
read_dta("../../../datos/2019/687-Modulo03/687-Modulo03/enaho01a-2019-300.dta"))
361
362
enaho04 = data.frame(
363
read_dta("../../../datos/2019/687-Modulo04/687-Modulo04/enaho01a-2019-400.dta")
364
)
365
366
enaho05 = data.frame(
367
read_dta("../../../datos/2019/687-Modulo05/687-Modulo05/enaho01a-2019-500.dta")
368
)
369
370
enaho34 = data.frame(
371
read_dta("../../../datos/2019/687-Modulo34/687-Modulo34/sumaria-2019.dta")
372
)
373
374
enaho37 = data.frame(
375
read_dta("../../../datos/2019/687-Modulo37/687-Modulo37/enaho01-2019-700.dta")
376
)
377
378
379
# Seleccionar variables
380
381
382
enaho02 <- enaho02[ , c("conglome", "vivienda", "hogar" , "codperso",
383
"ubigeo", "dominio" ,"estrato" ,"p208a", "p209",
384
"p207", "p203", "p201p" , "p204", "facpob07")]
385
386
enaho03 <- enaho03[ , c("conglome", "vivienda", "hogar" , "codperso",
387
"p301a", "p301b", "p301c" , "p300a","p301b","p301c")]
388
389
enaho05 <- enaho05[ , c("conglome", "vivienda", "hogar" , "codperso",
390
"i524e1", "i538e1", "p558a5" , "i513t", "i518",
391
"p507", "p511a", "p512b", "p513a1", "p505" , "p506", "d544t",
392
"d556t1",
393
"d556t2" , "d557t" , "d558t" , "ocu500" , "i530a" , "i541a")]
394
395
396
397
398
num = list(enaho34 , enaho37) # lista de data.frames
399
400
merge_hog = enaho01 # Master Data
401
402
for (i in num){
403
404
merge_hog <- merge(merge_hog, i,
405
by = c("conglome", "vivienda", "hogar"),
406
all.x = T, suffixes = c("",".y")
407
)
408
}
409
410
# Individual dataset
411
412
num = list(enaho03 , enaho04, enaho05 ) # lista de data.frames
413
414
merge_ind = enaho02 # Master Data
415
416
for (i in num){
417
418
merge_ind <- merge(merge_ind, i,
419
by = c("conglome", "vivienda", "hogar","codperso"),
420
all.x = T, suffixes = c("",".y")
421
)
422
}
423
424
425
426
merge_base <- merge(merge_ind, merge_hog,
427
by = c("conglome", "vivienda", "hogar"),
428
all.x = T, suffixes = c("",".y"))
429
430
index <- grep(".y$", colnames(merge_base))
431
432
433
merge_base_2019 <- merge_base[, - index]
434
435
436
437
#----------------------- Append -----------------------------------
438
439
440
merge_append <- bind_rows(merge_base_2020, merge_base_2019) # bind_rows from dyplr
441
442
443
unique(merge_append$aÑo)
444
445
# bind_rows from dplyr library
446
447
448
write_dta(merge_append, "../data/append_enaho_r.dta")
449
450
451
#------------------------ Poverty and dummies -------------------------------
452
453
#Ingreso nominal percapita mensual y gasto nominal mensual percapital del hogar
454
455
# inghog1d: ingreso anual bruto del hogar (incluye ingresos en forma de bienes)
456
# gashog2d: gasto anual bruto hogar
457
458
# Estas variables provienen del módulo 34 - sumaria (módulo de variables calculadas)
459
# Linea de pobreza
460
# mieperho: miembros del hogar
461
# Excluye a los trabajadores domésticos y a las personas que subarriendan una habitación en el hogar
462
463
464
merge_base_2020 <- merge_base_2020 %>%
465
dplyr::mutate(ingreso_month_pc = inghog1d/(12*mieperho),
466
gasto_month_pc = gashog2d/(12*mieperho)
467
) %>%
468
dplyr::mutate(dummy_pobre = ifelse( gasto_month_pc < linea ,
469
1 ,
470
0 ) ) %>%
471
dplyr::mutate(pobre = ifelse( gasto_month_pc < linea ,
472
"pobre" ,
473
"No pobre") ) %>%
474
dplyr::mutate(pc_pobre = case_when(pobreza == 1 ~ "Pobre extremo",
475
pobreza == 2 ~ "Pobre",
476
pobreza == 3 ~ "No pobre"))
477
478
# Si existe missing values en las variables usadas en el condicional
479
# entonces R colocará missing, esto no es directo en python
480
481
482
# Ejemplo adicional
483
484
var1 <- c(NA,2,3)
485
486
var2 <- c(400,1,5)
487
488
# creamos una base de datos en formato tibble similar a un data.frame
489
490
base <- tibble(
491
var1, var2
492
493
)
494
495
496
# aplicamos mutate para crear la dummy
497
498
base %>% mutate(
499
500
Dummy = ifelse(var1 < var2, 1,0)
501
502
)
503
504
505
sum(is.na(merge_base_2020$gashog2d)) # no hay missing en la variables gasto anual del hogar
506
507
508
#creando dummies usando la variabe de nivel educativo alcanzado p301a
509
510
merge_base_2020 <- dummy_cols(merge_base_2020, select_columns = 'p301a')
511
512
513
View(merge_base_2020[, c("p301a","p301a_1","p301a_2","p301a_3","p301a_4","p301a_5")])
514
515
516
517
################ Colappse #############################################
518
519
520
# Tab in R from dplyr library
521
522
count(merge_base_2020, pobreza, sort = TRUE)
523
524
count(merge_base_2020, pc_pobre, sort = F)
525
526
527
528
#Alternativa de tab (STATA) en R
529
530
table(merge_base_2020$pc_pobre)
531
532
table(merge_base_2020$p301a)
533
534
merge_base_2020 %>% dplyr::filter(!is.na(p301a)) %>% group_by(p301a) %>% summarise(Freq.abs = n()) %>%
535
mutate(Freq.relative = (Freq.abs/sum(Freq.abs))*100) %>% arrange(desc(Freq.relative))
536
537
# arrange de la libreria dplyr permite ordenar una variable
538
# dplyr::filter pues al instalar las librerias, R indica de conflicto en el nombre de funciones
539
# en librerias diferentes. El código significa que que usará la función o método filter de la librearia dplyr
540
541
542
df1 <- merge_base_2020 %>% group_by(conglome, vivienda, hogar ) %>%
543
summarise(
544
edu_min = min(p301a),
545
sup_educ = sum(p301a_10), total_miembros = n(),
546
edu_max = max(p301a), .groups = "keep"
547
)
548
549
550
551
# sin considerar los missing
552
553
df1_no_missing <- merge_base_2020 %>% group_by(conglome, vivienda, hogar ) %>%
554
summarise(
555
edu_min = min(p301a, na.rm = TRUE),
556
sup_educ = sum(p301a_10, na.rm = T), total_miembros = n(),
557
edu_max = max(p301a, na.rm = T),
558
)
559
560
561
562
# La advertencia surge por que se están agrupando por varias variables.
563
# Para evitar el mensaje, debemos incluir el argumento .groups = "keep"
564
565
df2 <- merge_base_2020 %>% group_by(conglome, vivienda, hogar ) %>%
566
summarise(
567
edu_min = min(p301a, na.rm = TRUE),
568
sup_educ = sum(p301a_10, na.rm = T), total_miembros = n(),
569
edu_max = max(p301a, na.rm = T), .groups = "keep"
570
)
571
572
573
# max(p301a, na.rm = T), na.rm = T causa que R no tome en cuenta a los missings
574
575
576
# na.rm permite ignorar los missing en las operaciones mean, sum, max
577
578
579
df3 <- merge_base_2020 %>% group_by(ubigeo_dep, region) %>%
580
summarise(index_poverty = mean(dummy_pobre, na.rm = T), .groups = "keep" )
581
582
583
class(merge_base_2020$p505)
584
585
586
#----------------- Indicadores socieconómicos ------------------------
587
588
# Primero indicamos a R que nuestra base de datos es una encuesta
589
# Para ello demebos declarar el diseño de la encuesta
590
# ids: conglomerado, strato: estrato y wieght : factor de expansión
591
592
survey_enaho <- merge_base_2020 %>% as_survey_design(ids = conglome, strata = estrato,
593
weight = facpob07)
594
595
#facpob07: factor de expansión a nivel población. Esto se constriye a partir de información Censo 2017
596
597
598
names(merge_base_2020)
599
600
ind1 <- survey_enaho %>% dplyr::filter(p208a >= 10 & p208a<= 65) %>% # me quedo con personas de 10 a 65 años
601
mutate(
602
g1 = ifelse(p208a>=10 & p208a <=20,1,0), # dummies por grupos de edad
603
g2 = ifelse(p208a>20 & p208a <=30,1,0),
604
g3 = ifelse(p208a >30 & p208a <=40,1,0),
605
g4 = ifelse(p208a >40 & p208a <=65,1,0),
606
607
) %>% group_by(region) %>% # indicadores de grupo de edad y nivel educativo
608
# indicadores a nivel regional
609
610
summarise(
611
612
gp1 = survey_mean(g1), gp2 = survey_mean(g2), gp3 = survey_mean(g3),
613
gp4 = survey_mean(g4),
614
g_sec = survey_mean(p301a_6, na.rm = T), g_uni_co = survey_mean(p301a_10, na.rm = T)
615
616
)
617
618
619
620
merge_base_2020$estrato
621
622
merge_base_2020$dominio
623
624
625
#------- Libreria Survey -------
626
627
# Se declara el diseño muestral
628
629
630
survey_enaho <- svydesign(id=~conglome, weights=~facpob07,strata=~estrato, data=merge_base_2020)
631
632
633
634
635
636