CoCalc -- script

GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Trabajo_grupal/WG4/Solucion/script_py.py
⁴⁵⁸⁷ views
1
# -*- coding: utf-8 -*-
2
"""
3
Estructura de Clase OLS
4

5
@author: Roberto
6
"""
7

8

9
import pandas as pd
10
import numpy as np
11
import pyreadr  # Load R dataset
12
import os # for usernanme y set direcotrio
13

14

15

16
class RegClass( object ):
17
    
18
    def __init__( self, X : pd.DataFrame , y : pd.Series , select_vars , sd_robust  ):
19
    
20
    
21
        if not isinstance( X, pd.DataFrame ):
22
            raise TypeError( "X must be a pd.DataFrame." )
23

24
        if not isinstance( y , pd.Series ):
25
            raise TypeError( "y must be a pd.Series." )
26
            
27
        
28
        # asignamos los atributos
29
        
30
        self.X = X 
31
        self.y = y
32
        self.select_vars = select_vars
33
        self.sd_robust = sd_robust
34
        
35
        # Try: ejecuta la seleccion de variables usando .loc (filtro de columnas por nombres)
36
        # Entonces, si select_var es una lista de variables, entonces se ejecuta la linea 1
37
        # Por otro lado, soi select_vars es una lista de posiciones de columnas, entonces 
38
        # la linea 1 no podrá ejecutarse y se ejecutará la linea 2. 
39
        
40
        try:
41
            
42
            self.X = self.X.loc[ : , self.select_vars ]  # 1)
43
        
44
        except Exception:
45
        
46
             self.X = self.X.iloc[:, self.select_vars]   # 2)
47
             
48
    
49
        
50
        self.X[ 'Intercept' ] = 1
51

52
        cols = self.X.columns.tolist() 
53
        new_cols_orders = [cols[ -1 ]] + cols[ 0:-1 ] 
54
        self.X = self.X.loc[ : , new_cols_orders ]  
55

56
        self.X_np = self.X.values
57
        self.y_np = y.values.reshape( -1 , 1 )
58
        self.columns = self.X.columns.tolist()
59
        
60
    # método que estima el vector de coeficientes
61
    
62
    def coefficients(self):
63
        
64
            X_np = self.X_np
65
            y_np = self.y_np
66
    
67
            # beta_ols
68
            beta_ols = np.linalg.inv( X_np.T @ X_np ) @ ( X_np.T @ y_np )
69
            
70
            self.beta_OLS = beta_ols
71
        
72
    def output1(self):
73
            
74
            self.coefficients()
75
            beta_OLS  = self.beta_OLS
76

77
            X_np = self.X_np
78
            y_np = self.y_np
79

80
            
81
            # errors
82
            e = y_np - ( X_np @ beta_OLS )
83
            
84
            # error variance
85
            N = X.shape[ 0 ]
86
            total_parameters = X.shape[ 1 ]
87
            error_var = ( (e.T @ e)[ 0 ] )/( N - total_parameters )
88
            
89
            # Matriz de Varianza y covarianza
90
            
91
            var_OLS =  error_var * np.linalg.inv( X_np.T @ X_np )
92
            
93
            # standard errors
94
            
95
            self.beta_se = np.sqrt( np.diag( var_OLS ) )
96
            
97
            # intervalo de confianza
98
            
99
            self.up_bd = beta_OLS.ravel() + 1.96*self.beta_se
100
            
101
            self.lw_bd = beta_OLS.ravel() - 1.96*self.beta_se
102
            
103
            
104
            
105
    def output2(self):
106
            
107
            self.coefficients()
108

109
            X_np = self.X_np
110
            y_np = self.y_np
111
            beta_OLS  = self.beta_OLS
112
            
113
            
114
            # errors
115
            y_est = X_np @ beta_OLS
116
            
117
            e = y_np - y_est
118
            
119
            e_square = np.array( list( map( lambda x: x**2 , e)) )
120
            
121
            # Matrix de white 
122
            
123
            matrix_robust = np.diag(list(e_square.flatten()))
124
            
125
            # Matrix de varianza y covarainza robusta ante heterocedasticidad
126
            
127
            Var_robust = np.linalg.inv(X_np.T @ X_np) @ X_np.T @ matrix_robust @ X_np @ np.linalg.inv(X_np.T @ X_np)
128
                      
129
            # standard errors
130
            
131
            self.beta_se = np.sqrt( np.diag( Var_robust ) )
132
            
133
            self.up_bd = beta_OLS.ravel() + 1.96*self.beta_se
134
            
135
            self.lw_bd = beta_OLS.ravel() - 1.96*self.beta_se
136
            
137
    def output3(self):
138
            
139
            self.coefficients()
140
            
141
            y_np = self.y_np
142
            X_np = self.X_np
143
            N = X_np.shape[ 0 ]
144
            y_est = X_np @ self.beta_OLS 
145
            
146
            SCR = sum(list( map( lambda x: x**2 , y_np - y_est)   ))
147
            SCT = sum(list( map( lambda x: x**2 , y_np - np.mean(y_est)   )))
148
            self.R2 = 1-SCR/SCT
149
            self.rmse = (SCR/N)**0.5
150
            
151
    
152
    def table(self):
153
            
154

155
            self.output3()  # ejecutamos el metodo que calcula el R2 y root-mse
156
            self.coefficients()  # ejectuamos el metodo que estima el vector de coeficientes
157
            
158
            
159
            if self.sd_robust == True : 
160
                
161
                self.output2()  # ejecutamos el metodo de errors estandar corregido ante heteroce.
162
            
163
                table_data ={  'Coef.'    : self.beta_OLS.ravel() , 
164
                   "Std.Err." : self.beta_se.ravel(),
165
                   "Lower_bound"   : self.lw_bd.ravel(),
166
                   "Upper_bound"   : self.up_bd.ravel()
167
                }
168
                
169
                
170
                # defining index names
171
                index_names = self.columns
172
    
173
                # defining a pandas dataframe 
174
                reg_OLS = pd.DataFrame( table_data , index = index_names )
175
                
176
                # output
177
                
178
                
179
                table_dict = {"Table":reg_OLS, "R2": self.R2, "MSE": self.rmse}
180

181
            else:
182
                
183
                self.output1() # ejecutamos el metodo de errors estandar sin corrección ante heterocedasticidad
184
                
185
                table_data ={  'Coef.'    : self.beta_OLS.ravel() , 
186
                   "Std.Err." : self.beta_se.ravel(),
187
                   "Lower_bound"   : self.lw_bd.ravel(),
188
                   "Upper_bound"   : self.up_bd.ravel()
189
                }
190
                
191
                
192
                # defining index names
193
                index_names = self.columns
194

195
                # defining a pandas dataframe 
196
                reg_OLS = pd.DataFrame( table_data , index = index_names )
197
                
198
                # output
199
                
200
                
201
                table_dict = {"Table":reg_OLS, "R2": self.R2, "MSE": self.rmse}
202
                
203
                
204
                
205

206
            return table_dict
207
        
208
        
209
    
210

211
user = os.getlogin()   # Username
212

213
# Set directorio
214

215
os.chdir(f"C:/Users/{user}/Documents/GitHub/1ECO35_2022_2/Trabajo_grupal/WG4/Solucion") # Set directorio
216

217
cps2012_env = pyreadr.read_r("../../../data/cps2012.Rdata") # output formato diccionario
218

219

220
cps2012 = cps2012_env[ 'data' ].iloc[0:500]  # seleccionamo solo 500 observaciones
221

222
       
223
Y = cps2012['lnw']
224
X = cps2012.iloc[:,np.arange(1,18)]   # No consideramos el logaritmo del salario 
225

226

227
Regression1 = RegClass(X, Y , ["female","hsg","cg","exp1","exp2"] , sd_robust = True)
228
Regression1.table()
229

230

231
Regression2 = RegClass(X, Y , ["female","hsg","cg","exp1","exp2"] , sd_robust = False)
232
Regression2.table()
233

234

235
Regression3 = RegClass(X, Y , [1,2,5,8,10] , sd_robust = False)
236
Regression3.table()
237

238

239
Regression4 = RegClass(X, Y , [1,2,5,8,10] , sd_robust = True)
240
Regression4.table()
241

242

243

244

245

246

247

248

249

250

251

252
Product

Resources

Company