# Path: blob/main/Trabajo_grupal/WG2/Solución/script_r.R
# 2835 views
#######################################
# Homework 2 - solution
# @author: Roberto Mendoza
# @date: 19/09/2020
# @code: This code cleans the dataset from the native Census
#######################################
#
# Reads the Junin region workbook, prints a few exploratory summaries,
# renames some columns, derives a total-population column and two share
# columns, keeps a subset of districts and rows, and writes the result out.

library(dplyr)  # data-cleaning verbs (rename, mutate, filter, select)
library(tidyr)  # data-cleaning helpers
library(readxl) # reader for Excel files

# NOTE(review): USERNAME and the "C:/Users" prefix look Windows-specific;
# on other systems this path will not exist and setwd() will stop the script.
user <- Sys.getenv("USERNAME") # username

# Set the working directory so the relative data paths below resolve.
setwd(
  file.path(
    "C:/Users", user,
    "Documents/GitHub/1ECO35_2022_2/Trabajo_grupal/WG2/Solución"
  )
)

junin_data <- read_excel("../../../data/Region_Junin.xlsx")


# 1. ----------- Variable names ----------------------

names(junin_data)


# 2. ---------- Variable types and statistics ----------

str(junin_data)
sapply(junin_data, class)
summary(junin_data) # main statistics of numeric variables


# 3. ------- Missing values --------------

sum(is.na(junin_data)) # total missing

colSums(is.na(junin_data)) # total missing by variable


# 4. ------- Change variable's name --------------

# %>% Ctrl + shift + m (shortcut)

junin_data <- junin_data %>%
  rename(
    comunidad = Place,
    homxlee = men_not_read,
    mujerxlee = women_not_read,
    totalxlee = total_not_read
  )


# 5. ------- unique values --------------

unique(junin_data["comunidad"])

unique(junin_data["District"])


# 6. ------- Percentage variables --------------

# total_pob adds the four population counts (peruvian/foreign, men/women).
# var_women_read and var_men_read are computed from the "*_not_read" counts
# renamed above, i.e. each is that group's share of totalxlee; the column
# names are kept as in the original because they are written to the output.
junin_data <- junin_data %>%
  mutate(
    total_pob = peruvian_men + peruvian_women + foreign_men + foreign_women,
    var_women_read = mujerxlee / totalxlee,
    var_men_read = homxlee / totalxlee
  )


# 7. ------- Dataset --------------

# Districts retained in the cleaned dataset.
kept_districts <- c(
  "CIUDAD DEL CERRO",
  "JAUJA", "ACOLLA", "SAN GERÓNIMO", "TARMA",
  "OROYA", "CONCEPCIÓN"
)

# Keep the selected districts, then only rows where both natives and
# mestizos are positive, then the five output columns. Column names are
# referenced directly (data masking) instead of through junin_data$.
junin_data <- junin_data %>%
  filter(District %in% kept_districts) %>%
  filter(natives > 0 & mestizos > 0) %>%
  select(District, comunidad, total_pob, var_women_read, var_men_read)

write.csv(junin_data, "../../../data/Base_cleaned.csv")

# NOTE(review): write.csv() emits CSV text, so this file is a CSV despite
# its .xlsx extension. Producing a real workbook would need an Excel-writer
# package, and none is loaded in this script; the call is kept as in the
# original.
write.csv(junin_data, "../../../data/Base_cleaned.xlsx")