# Path: blob/main/Trabajo_grupal/WG6/Solución/solucion_r.R
# 2835 views
################ Solución R ############################

# Clear the global environment so the script starts from an empty workspace.
# NOTE(review): this also wipes the caller's objects when the file is
# source()d from an interactive session.
rm(list = ls(all = TRUE))

# Load libraries
librarian::shelf(tidyverse, haven, srvyr)


# 1.0 Merge datasets ----------------------------------------------

user <- Sys.getenv("USERNAME") # Windows user name

# Every relative path below is resolved against this working directory.
setwd(file.path("C:/Users", user, "Documents/GitHub/1ECO35_2022_2"))

# Keys that identify a household in every ENAHO module used here.
hh_keys <- c("conglome", "vivienda", "hogar")

enaho_19_01 <- read_dta("../../datos/2019/687-Modulo01/687-Modulo01/enaho01-2019-100.dta")

enaho_19_34 <- read_dta("../../datos/2019/687-Modulo34/687-Modulo34/sumaria-2019.dta")

deflactor <- read_dta(
  "../../datos/2020/737-Modulo34/737-Modulo34/ConstVarGasto-Metodologia actualizada/Gasto2020/Bases/deflactores_base2020_new.dta"
)

# Interactive inspection of the column types involved in the deflator join.
sapply(deflactor, class)

class(enaho_19_01$aÑo)

## Year 2019 ##

# Sumaria: keep only the household keys plus the variables needed below.
enaho_19_34 <- enaho_19_34 |>
  dplyr::select(conglome, vivienda, hogar, mieperho, inghog1d, gashog2d, ld)

# Module 1 (dwelling and household characteristics) joined with Sumaria.
enaho_merge_19 <- enaho_19_01 |>
  dplyr::rename(year = aÑo) |>
  dplyr::select(year, conglome, vivienda, hogar, ubigeo) |>
  left_join(enaho_19_34, by = hh_keys)

## Year 2020 ##

# The original script loaded the Module 02 (`-200`) file here, although the
# object is named `enaho_20_01` and is merged exactly like the 2019 Module 01
# file. That repeats a household once per row of the `-200` file. The path now
# follows the 2019 pattern.
# NOTE(review): confirm that this file exists in the local data folder.
enaho_20_01 <- read_dta("../../datos/2020/737-Modulo01/737-Modulo01/enaho01-2020-100.dta")

enaho_20_34 <- read_dta("../../datos/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta")

enaho_20_34 <- enaho_20_34 |>
  dplyr::select(conglome, vivienda, hogar, mieperho, inghog1d, gashog2d, ld)

enaho_append <- enaho_20_01 |>
  dplyr::rename(year = aÑo) |>
  dplyr::select(year, conglome, vivienda, hogar, ubigeo) |> # variable selection
  left_join(enaho_20_34, by = hh_keys) |>                   # left merge
  bind_rows(enaho_merge_19) |>                              # append 2019
  mutate(
    # The first two characters of ubigeo are the department code; numeric so
    # that it can be matched against the deflator's `dpto`.
    dep = as.numeric(substr(ubigeo, 1, 2)),
    year = as.numeric(year) # year from string to numeric
  ) |>
  left_join(deflactor, by = c("year" = "aniorec", "dep" = "dpto")) |>
  mutate(
    # Deflated variables: divided by 12, both deflators and household size.
    ing_pc_real = inghog1d / (12 * ld * i00 * mieperho),
    gas_pc_real = gashog2d / (12 * ld * i00 * mieperho)
  )


# Hourly wage ---------------------------------------------------

enaho_20_05 <- read_dta("../../datos/2020/737-Modulo05/737-Modulo05/enaho01a-2020-500.dta")

enaho_20_05 <- enaho_20_05 |>
  dplyr::select(conglome, vivienda, hogar, i524e1, i538e1, i513t, i518) |>
  mutate(
    # Row-wise sums that ignore NA (a row with both parts missing sums to 0).
    # Vectorized with rowSums() so the result is not left as a row-wise frame.
    suma_ingreso = rowSums(
      cbind(as.numeric(i524e1), as.numeric(i538e1)),
      na.rm = TRUE
    ),
    total_horas = rowSums(
      cbind(as.numeric(i513t), as.numeric(i518)),
      na.rm = TRUE
    ),
    hour_wage = suma_ingreso / (52 * total_horas),
    # Undefined wages become NA: 0 (no income), NaN (0 / 0) and also Inf
    # (positive income with zero hours), which the original left in place.
    hour_wage = if_else(
      is.finite(hour_wage) & hour_wage != 0,
      hour_wage,
      NA_real_
    )
  )


# Group by -------------------------------------------------------------

enaho_20_02 <- read_dta("../../datos/2020/737-Modulo02/737-Modulo02/enaho01-2020-200.dta")

# Re-read the full Sumaria: the copy above was trimmed to a few columns and no
# longer carries pobreza, estrato, ubigeo, gru51hd or factor07.
enaho_20_34 <- read_dta("../../datos/2020/737-Modulo34/737-Modulo34/sumaria-2020.dta")

# One row per household with the maximum age; the result stays grouped.
df <- enaho_20_02 |>
  dplyr::select(conglome, vivienda, hogar, p208a) |>
  group_by(conglome, vivienda, hogar) |>
  summarise(edad_max = max(p208a, na.rm = TRUE), .groups = "keep")

# Same table, returned as a plain ungrouped tibble.
df2 <- enaho_20_02 |>
  dplyr::select(conglome, vivienda, hogar, p208a) |>
  group_by(conglome, vivienda, hogar) |>
  summarise(edad_max = max(p208a, na.rm = TRUE), .groups = "drop")

# Keep every original row and attach the household maximum age to each one.
df3 <- enaho_20_02 |>
  dplyr::select(conglome, vivienda, hogar, p208a) |>
  group_by(conglome, vivienda, hogar) |>
  mutate(edad_max = max(p208a, na.rm = TRUE)) |>
  ungroup()


# Merge with Sumaria (module 34)

# Ungroup first: `df` has one group per household, and the flag below is a
# plain row-wise condition.
enaho_pension <- df |>
  ungroup() |>
  left_join(enaho_20_34, by = hh_keys) |>
  mutate(
    # 1 when the oldest member is 65 or older and pobreza is 1 or 2.
    # `%in%` never returns NA, so a missing `pobreza` yields 0 rather than NA.
    hogar_benf_pen = if_else(edad_max >= 65 & (pobreza %in% c(1, 2)), 1, 0)
  )

table(enaho_pension$hogar_benf_pen)

# Checked: the result matches the Python solution.


# Indicators ----

# Social programs

enaho_20_37 <- read_dta("../../datos/2020/737-Modulo37/737-Modulo37/enaho01-2020-700.dta")

enaho_20_37 <- enaho_20_37 |>
  dplyr::select(conglome, vivienda, hogar, p710_04)

# Department code (first two characters of ubigeo) -> department name.
region_by_dep <- c(
  "01" = "Amazonas",
  "02" = "Ancash",
  "03" = "Apurimac",
  "04" = "Arequipa",
  "05" = "Ayacucho",
  "06" = "Cajamarca",
  "07" = "Callao",
  "08" = "Cusco",
  "09" = "Huancavelica",
  "10" = "Huanuco",
  "11" = "Ica",
  "12" = "Junin",
  "13" = "La Libertad",
  "14" = "Lambayeque",
  "15" = "Lima",
  "16" = "Loreto",
  "17" = "Madre de Dios",
  "18" = "Moquegua",
  "19" = "Pasco",
  "20" = "Piura",
  "21" = "Puno",
  "22" = "San Martin",
  "23" = "Tacna",
  "24" = "Tumbes",
  "25" = "Ucayali"
)

enaho_20 <- enaho_20_34 |>
  dplyr::select(
    conglome, vivienda, hogar, estrato, ubigeo, gru51hd, gashog2d, factor07
  ) |>
  left_join(enaho_20_37, by = hh_keys) |>
  mutate(
    dep = substr(ubigeo, 1, 2),
    health_spend = (gru51hd / gashog2d) * 100, # gru51hd as a % of gashog2d
    # Codes outside 01-25 (or a missing code) map to NA.
    region = unname(region_by_dep[dep])
  )

# Here the expansion factor is the household-level one, factor07.
survey_enaho <- enaho_20 |>
  as_survey_design(ids = conglome, strata = estrato, weights = factor07)

attributes(survey_enaho)

table_ind <- survey_enaho |>
  group_by(region) |>
  summarise(
    percent_juntos = survey_mean(p710_04, na.rm = TRUE) * 100,
    percent_health = survey_mean(health_spend, na.rm = TRUE)
  )