CoCalc -- Grupo5_Pregunta1

GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Trabajo_grupal/WG5/Grupo5_Pregunta1_Tarea5.ipynb
⁴⁶⁸² views

Kernel: Python 3

In [23]:

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import statistics
import inspect 
from scipy.stats import t # t - student 
import os
import pyreadr
user = os.getlogin()   # Username
print(user)

Out[23]:

Usuario

In [24]:

os.chdir(f"C:/Users/{user}/Documents/GitHub/1ECO35_2022_2/Lab6")
cps2012_env = pyreadr.read_r("../data/cps2012.Rdata")
#cps2012_env = cps2012_env['data'] #Pasamos a data frame

In [36]:

#Para crear W hay que hacer algo con listas
#W = ['lnw','female','widowed', 'divorced', 'separated', 'nevermarried', 'hsd08', 'hsd911', 'hsg', 'cg', 'ad', 'mw', 'so', 'we', 'exp1', 'exp2', 'exp3', 'exp4', 'weight', 'ne', 'sc']
#W = [1.2.3]

class OLS(object):
    #Definimos X como data frame, Y como serie, W como lista para seleccionar las variables de X y el booleano para errores robustos.
    
   # __slots__ = ['__X',  'Y',  'W', 'robust_sd', 'n', 'X1',  'Y1',  'beta', 'nk', 'y_est'. 'error', 'Var', 'sd', 'límite_inferior', 'límite_superior', 'SCR', 'rmse', 'R2']
    
    def __init__(self, __X:pd.DataFrame ,Y:pd.Series , W, robust_sd=False):
        

        self.__X = X
        self.__X[ 'Intercept' ] = 1 #Puede generar un error en el data frame
        self.Y = Y
        self.robust_sd = robust_sd
        self.W = W  #['lnw','female','widowed', 'divorced', 'separated', 'nevermarried', 'hsd08', 'hsd911', 'hsg', 'cg', 'ad', 'mw', 'so', 'we', 'exp1', 'exp2', 'exp3', 'exp4', 'weight', 'ne', 'sc']
        #Método 1
    def Determinarcoeficientes(self): #Método 1
        
        #self.W = [self.X.columns.tolist()] # nombre de la base de datos como objeto lista
        
        self.n = self.__X.shape[0] # numero de observaciones, # self.n "Se crea un nuevo atributo"
        k = self.__X.shape[1] + 1 #numero de variables y el intercepto
        self.X1 = self.__X.to_numpy()  # self.X.to_numpy()  # DataFrame to numpy #np.column_stack((np.ones(self.n ), self.X.to_numpy() ))
        self.Y1 = self.Y.to_numpy().reshape(self.n  ,1)  #reshape(-1  ,1)
        
        self.beta = np.linalg.inv(self.X1.T @ self.X1) @ ((self.X1.T) @ self.Y1 )
        self.nk = self.n - k 
        
        #Método 2
    def Errorvarcovintcof(self):              
        
        if self.robust_sd:
            
            self.y_est =  self.X1 @ self.beta
            self.error = self.Y1 - self.y_est 
            sigma_1 =  sum(list( map( lambda x: x**2 , self.error)   )) / self.nk 
            self.Var = sigma_1*np.linalg.inv(self.__X.T @ self.__X) #Matríz de varianzas y covarianzas caso no robusto
            self.sd = np.sqrt( np.diag(self.Var) ) #Desviación estandar o errores estandar
            #self.sd_1 = self.sd.to_numpy()
            self.límite_inferior = self.beta-1.96*self.sd #Tenemos problemas generando el límite inferior y superior
            self.límite_superior = self.beta+1.96*self.sd
        
            #Método 3
        else:
            
            self.y_est =  self.X1 @ self.beta
            self.error = self.Y1 - self.y_est
            matrix_robust = np.diag(list( map( lambda x: x**2 , self.error)))  
            self.Var = np.linalg.inv(self.__X.T @ self.__X) @ self.__X.T @ matrix_robust @ self.__X @ np.linalg.inv(self.__X.T @ self.__X)
            self.sd = np.sqrt( np.diag(self.Var) )
            #self.sd_1 = self.sd.to_numpy()
            self.límite_inferior = self.beta-1.96*self.sd
            self.límite_superior = self.beta+1.96*self.sd

        #Método 4
    def R2yMSE(self):
        
        self.Determinarcoeficientes()  # run function
        self.Errorvarcovintcof()
        
        #SCR = sum(list( map( lambda x: x**2 , self.error)))
        #SCT = sum(list( map( lambda x: x**2 , self.Y - np.mean(self.y_est))))
        #R2 = 1 - self.SCR/self.SCT

        #y_est =  self.y_est
        #error = self.error
        self.SCR = np.sum(np.square(self.error))
        self.rmse = (self.SCR/self.n)**0.5
        SCT = np.sum(np.square(self.Y1 - np.mean(self.Y1))) 

        self.R2 = 1 - self.SCR/SCT

        return self.R2           

        #Método 5
    def __Table(self, **Kargs):
        
        #W = ['lnw','female','widowed', 'divorced', 'separated', 'nevermarried', 'hsd08', 'hsd911', 'hsg', 'cg', 'ad', 'mw', 'so', 'we', 'exp1', 'exp2', 'exp3', 'exp4', 'weight', 'ne', 'sc']
        # Lo agregamos, pero no lo usamos. Lo lamento Roberto, se nos acababa el tiempo. 
        
        self.R2yMSE()
        self.Determinarcoeficientes()
        self.Errorvarcovintcof()
        
        r2= self.R2
        scr = self.SCR
        sigma =  scr / self.nk
        Var = self.Var #sigma*np.linalg.inv(self.X1.T @ self.X1)
        sd = self.sd #np.sqrt( np.diag(Var) )
        #lower_bound = self.límite_inferior
        #upper_bound = self.límite_superior
        rmse = self.rmse #(scr/self.n)**0.5
    
        if (Kargs['Output'] == "DataFrame"):

               resultados_df = {"OLS": self.beta.flatten() ,
                                "standar_error": self.sd.flatten() ,
                                "Lower_bound": self.límite_inferior.flatten() ,
                                "Upper_bound": self.límite_superior.flatten() 
                    } 
               
               index_names = self.W
                                             #['Resultados'] +
               
               ResultadosdeOLS = pd.DataFrame(resultados_df, index = index_names)
    
           #self.beta.flatten() # multy-array a simple array 
            
        elif (Kargs['Output'] == "Diccionario"):

            df = pd.DataFrame({"OLS": self.beta.flatten() , "standar_error": self.sd.flatten() , "Lower_bound": self.límite_inferior.flatten() , "Upper_bound":self.límite_superior.flatten() }) 
            
            df_1 = { "Root_MSE":rmse.flatten() } #R2"R2": R2.flatten()
            
            df_2 = { "R2": r2.flatten() } 
            
            ResultadosdeOLS = pd.DataFrame(df, df_1, df_2)

        return ResultadosdeOLS

In [41]:

cps2012_env  # es un diccionario. En la llave "data" está la base de datos 
cps2012 = cps2012_env[ 'data' ] # extrae información almacenada en la llave data del diccionario cps2012_env
dt = cps2012.describe()
cps2012.shape

variance_cols = cps2012.var().to_numpy() # to numpy

Dataset = cps2012.iloc[ : ,  np.where( variance_cols != 0   )[0] ]
__X = Dataset.iloc[:,1:10]
Y = Dataset[['lnw']]
W = X.columns.tolist()
Reg1 = OLS(X, Y, W, robust_sd=True)

In [42]:

A = OLS(__X, Y, W)

In [44]:

A.__X

Out[44]:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-44-ace235b425df> in <module>
----> 1 A.__X

AttributeError: 'OLS' object has no attribute '__X'

In [46]:

A.__Table('Output') = "DataFrame"

Out[46]:

  File "<ipython-input-46-0da80f1c6148>", line 1
    A.__Table('Output') = "DataFrame"
    ^
SyntaxError: cannot assign to function call

Product

Resources

Company