CoCalc -- R_script.ipynb

GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Lab10/R_script.ipynb
⁴⁵²⁵ views

Kernel: R

1.0 Linear regression, Plots and Coefplot

In [184]:

library('dplyr') # filter dataset
library(ggplot2) # plots
library(sandwich) # linear models
library(lmtest) # linear models 
library(xtable)
library(haven) # for read Stara files, spps and others 
library("stringr") # library to RE
library("readxl")

In [22]:

data <- read_dta("../../../data/Pesos/peso.dta")

In [23]:

head(data)

Out[23]:

In [24]:

# type of variables 

as.list(sapply(data, class))

Out[24]:

In [50]:

# Dummy variable 

data <- data %>%  mutate(Dummy = ifelse(cigs > 0 ,  1 , ifelse(!is.na(cigs),0, NA) ) ) %>%  
                        mutate(Dummy1 = case_when(Dummy == 0 ~ "No smoking",  # create this variables for plot section
                                          Dummy == 1 ~ "Smoking"))

attach(data)

Out[50]:

The following objects are masked from data (pos = 3):

    bwght, bwghtlbs, cigprice, cigs, cigtax, Dummy, Dummy1, faminc,
    fatheduc, lbwght, lfaminc, male, motheduc, packs, parity, white

The following objects are masked from data (pos = 4):

    bwght, bwghtlbs, cigprice, cigs, cigtax, Dummy, faminc, fatheduc,
    lbwght, lfaminc, male, motheduc, packs, parity, white

The following objects are masked from data (pos = 5):

    bwght, bwghtlbs, cigprice, cigs, cigtax, Dummy, faminc, fatheduc,
    lbwght, lfaminc, male, motheduc, packs, parity, white

In [132]:

theme_set(theme_bw(20))

data %>%  ggplot(aes(x=bwghtlbs, fill = Dummy1)) +
    geom_histogram( color="black", alpha=0.6, position = 'identity', size=0.6) +
    scale_fill_manual(values=c("#69b3a2", "#404080"), name=NULL) +  # for legend section
    labs(x = " ", y = "Absolute frequency", title = "Smoking status and newborn weight (lbs)", size = 15) + 
     theme(text=element_text(size =15), plot.title = element_text(hjust = 0.5)) +
     theme(legend.position = c(0.85,0.9))  +  theme(panel.grid.major = element_blank(), 
               panel.grid.minor = element_blank()) + # for legend position
     scale_x_continuous(limits = c(0,18), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0,300), expand = c(0, 0))

Out[132]:

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Warning message:
"Removed 4 rows containing missing values (geom_bar)."

In [131]:

theme_set(theme_bw(20))

data %>%  ggplot(aes(x=bwghtlbs, fill = Dummy1)) +
    geom_histogram( aes(y = ..density..), color="black", alpha=0.3, position = 'identity', size=0.6) +
    scale_fill_manual(values=c("blue", "red"), name=NULL) +  # for legend section
    labs(x = " ", y = "Relative frequency", title = "Smoking status and newborn weight (lbs)", size = 15) + 
     theme(text=element_text(size =15), plot.title = element_text(hjust = 0.5)) +
     theme(legend.position = c(0.85,0.9)) +  theme(panel.grid.major = element_blank(), 
               panel.grid.minor = element_blank()) + # drop square lines 
 scale_x_continuous(limits = c(0,18), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0,0.4), expand = c(0, 0))

Out[131]:

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Warning message:
"Removed 4 rows containing missing values (geom_bar)."

In [130]:

theme_set(theme_bw(20))

data %>% ggplot(aes(x=bwghtlbs, fill = Dummy1 , colour=Dummy1)) +
  geom_density(alpha=0.3, color = "black", size=0.6) +
ggtitle("Smoking status and newborn weight (lbs)") + 
theme(text=element_text(size =15), plot.title = element_text(hjust = 0.5)) +
labs(x = "", 
     y = "Kernel Density") +
scale_fill_manual(values=c("blue", "red"), name=NULL) +
     theme(legend.position = c(0.85,0.9)) +  theme(panel.grid.major = element_blank(), 
               panel.grid.minor = element_blank()) + # drop square lines 
 scale_x_continuous(limits = c(0,18), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0,0.4), expand = c(0, 0))

Out[130]:

1.2 Linear Regression

In [151]:


# First Model 

model1 <- lm(lbwght ~ Dummy)

model1.tab <- coeftest(model1, vcov=vcovHC(model1, type='HC'))

model1_coef <- model1.tab[2,1]

model1_coef_se = model1.tab[2,2]

# HC0: standar error robust aginst heterocedasticity

model1_lower = coefci(model1, df = Inf, vcov. = vcovHC, type = "HC")[2,1]
model1_upper = coefci(model1, df = Inf, vcov. = vcovHC, type = "HC")[2,2]


# Second Model 

model2 <- lm(lbwght ~ Dummy + motheduc)

model2.tab <- coeftest(model2, vcov=vcovHC(model2, type='HC'))

model2_coef <- model2.tab[2,1]

model2_coef_se = model2.tab[2,2]

model2_lower = coefci(model2, df = Inf, vcov. = vcovHC, type = "HC")[2,1]
model2_upper = coefci(model2, df = Inf, vcov. = vcovHC, type = "HC")[2,2]


# Third Model 

model3 <- lm(lbwght ~ Dummy + motheduc + lfaminc + white + Dummy:(motheduc + lfaminc + white))

model3.tab <- coeftest(model3, vcov=vcovHC(model3, type='HC'))

model3_coef <- model3.tab[2,1]

model3_coef_se = model3.tab[2,2]

model3_lower = coefci(model3, df = Inf, vcov. = vcovHC, type = "HC")[2,1]
model3_upper = coefci(model3, df = Inf, vcov. = vcovHC, type = "HC")[2,2]

In [153]:

table<- matrix(0, 3, 4)
table[1,1]<- model1_coef  
table[1,2]<- model1_coef_se   
table[2,1]<- model2_coef
table[2,2]<- model2_coef_se    
table[3,1]<- model3_coef
table[3,2]<- model3_coef_se  
table[1,3]<- model1_lower 
table[1,4]<- model1_upper 
table[2,3]<- model2_lower 
table[2,4]<- model2_upper
table[3,3]<- model3_lower 
table[3,4]<- model3_upper 

colnames(table)<- c("Estimate","se","lower_bound","upper_bound")
rownames(table)<- c("OLS baseline", "OLS with controls", "OLS interactive model")
tab<- xtable(table)
tab

Out[153]:

In [ ]:

1.3 Coefplot

In [181]:

theme_set(theme_bw(20))
options(repr.plot.width = 8, repr.plot.height =8)  # plot size 

ggplot(tab, aes(x=rownames(tab), y=Estimate)) +
    geom_point(size=4, color = 'red') + 
    geom_errorbar(aes(ymin=lower_bound, ymax=upper_bound) , width = 0.1,color="blue", size = 0.9) + 
  labs(x="", y="") + ggtitle("Smoking Coefficient (95% CI)")  +
    theme(text=element_text(size =15), plot.title = element_text(hjust = 0.5)) +
    theme(panel.grid.major = element_blank(), 
               panel.grid.minor = element_blank())  + geom_hline(yintercept=0, linetype="dashed", 
                color = "black", size=1) +
scale_x_discrete(limits = c("OLS baseline","OLS with controls", "OLS interactive model")) # order x - axis

Out[181]:

2.0 regular Expression

In [185]:

data <- read_excel("../../../data/Centro_salud/Centro_salud_mental.xls")

In [186]:

head(data)

Out[186]:

In [224]:

# equivalents expression to regex using apply 

data$Ins <- apply(data['Institución_ruc'], 1, function(x) gsub("[0-9]", '', x))

data$Ins <- apply(data['Institución_ruc'], 1, function(x) gsub("\\d", '', x))   # drop digist               
                  
data$Ruc <- apply(data['Institución_ruc'], 1, function(x) gsub("[^0-9]", '', x))
                  
data$Ruc <- apply(data['Institución_ruc'], 1, function(x) gsub("\\D", '', x)) # No drop digist

In [225]:

data

Out[225]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

1.0 Linear regression, Plots and Coefplot

1.2 Linear Regression

1.3 Coefplot

2.0 regular Expression

Product

Resources

Company