Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
robertopucp
GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Trabajo_final/grupo2/Trabajo Final Python.ipynb
2714 views
Kernel: Python 3

Pregunta 1. Modelos Lineales Python

Grupo 2

Estudiantes:

Enrique Ríos

Angie Quispe

Amalia

Fabio Salas

# Primero se procede a importar los packages import pandas as pd import numpy as np import re # Los Plots de las librerías import matplotlib.pyplot as plt import seaborn as sns # Se importa linear models library import statsmodels.api as sm # linear regression utiliza todas las columnas de base de datos import statsmodels.formula.api as smf # linear regression usa una formula # Exportar la tabla en Latex !pip install pystout from pystout import pystout user = os.getlogin() # Username os.chdir(f"/Users/enriquerios/Desktop/PUCP 2022.2/R y Python/Trabajo Final (2.0)/") # Set directorio
Requirement already satisfied: pystout in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (0.0.7) Requirement already satisfied: numpy in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pystout) (1.20.1) Requirement already satisfied: pandas>=0.25 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pystout) (1.2.4) Requirement already satisfied: python-dateutil>=2.7.3 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.25->pystout) (2.8.1) Requirement already satisfied: pytz>=2017.3 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.25->pystout) (2021.1) Requirement already satisfied: six>=1.5 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas>=0.25->pystout) (1.15.0)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-19-96738e3c08cc> in <module> 23 24 ---> 25 os.chdir(f"/Users/enriquerios/Desktop/PUCP 2022.2/R y Python/Trabajo Final (2.0)/") # Set directorio NameError: name 'os' is not defined
# Se procede a instalar statsmodels !pip install statsmodels
Requirement already satisfied: statsmodels in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (0.12.2) Requirement already satisfied: patsy>=0.5 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (0.5.1) Requirement already satisfied: pandas>=0.21 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (1.2.4) Requirement already satisfied: scipy>=1.1 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (1.6.2) Requirement already satisfied: numpy>=1.15 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (1.20.1) Requirement already satisfied: python-dateutil>=2.7.3 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.21->statsmodels) (2.8.1) Requirement already satisfied: pytz>=2017.3 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.21->statsmodels) (2021.1) Requirement already satisfied: six in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from patsy>=0.5->statsmodels) (1.15.0)
# Se procede a leer la data del paper Economic Shocks and Civil Conflict en formato dta. data = pd.read_stata("/Users/enriquerios/Desktop/PUCP 2022.2/R y Python/Trabajo Final (2.0)/mss_repdata.dta", convert_categoricals=False) # convert_categoricals=False: No se lee las etiquetas de valor
data
# Se procede a describir la tabla y el tipo de variables data.describe() data.info() data.dtypes
<class 'pandas.core.frame.DataFrame'> Int64Index: 743 entries, 0 to 742 Columns: 200 entries, ccode to soc dtypes: datetime64[ns](1), float32(104), float64(78), int32(9), int8(5), object(3) memory usage: 813.4+ KB
ccode float64 year datetime64[ns] country_name object country_code object GPCP float32 ... fh_pol float64 S float32 W float32 WoverS float32 soc float32 Length: 200, dtype: object
# Para ello las variables de interés son las siguientes # NDVI_g es la Tasa de variación del índice de vegetación # tot_100 son los Términos de intercambio # trade_pGDP son el Porcentaje de las exportaciones respecto del PBI # land_crop es el Porcentaje de tierra cultivable en uso # va_agr es el Valor agregado del sector agrícola respecto del PBI # va_ind_manf es el Valor agregado del sector manufacturero al PBI # Seleccionamos las variables para las estadísticas descriptivas table1 = data.loc[:,["NDVI_g", "tot_100", "trade_pGDP", "land_crop", "va_agr", "va_ind_manf"]] table1 # selccionamos los estadísticos de interés: media, error estándar y cantidad de observaciones summary_table = table1.describe().loc[["mean","std","count"]] summary_table
# Ahora se procede a presentar la tabla de forma transpuesta summary_table = table1.describe().loc[["mean","std","count"]].T # .t permite tranponer el DataFrame summary_table
table1.columns
Index(['NDVI_g', 'tot_100', 'trade_pGDP', 'land_crop', 'va_agr', 'va_ind_manf'], dtype='object')
table1.columns new_names = ["Tasa de variación del índice de vegetación", "Términos de intercambio", "Porcentaje de las exportaciones respecto del PBI", "Porcentaje de tierra cultivable en uso", "Valor agregado del sector agrícola respecto del PBI", "Valor agregado del sector manufacturero al PBI"]
# Se procede a unir las listas bajo la estructura diccionario dict( zip( table1.columns, new_names) )
{'NDVI_g': 'Tasa de variación del índice de vegetación', 'tot_100': 'Términos de intercambio', 'trade_pGDP': 'Porcentaje de las exportaciones respecto del PBI', 'land_crop': 'Porcentaje de tierra cultivable en uso', 'va_agr': 'Valor agregado del sector agrícola respecto del PBI', 'va_ind_manf': 'Valor agregado del sector manufacturero al PBI'}
# Ahora se personaliza el resumen de la tabla index_nuevos_nombres = dict( zip( table1.columns, new_names) ) columns_nuevos_nombres = { "mean": "Mean", "std": "Standard Deviation", "count": "Observations", } # Rename rows (indexes) and columns summary_table.rename(index=index_nuevos_nombres, columns=columns_nuevos_nombres, inplace=True)
summary_table

Exportar Tabla en LaTex

# Luego se procede a exportar la tabla en formato Latex # \ permite escribir el código extenso en lineas diferentes summary_table.style.format(subset="Mean", precision=2).format(subset="Standard Deviation", precision=2).format(subset="Observations", precision=0).to_latex( "summary1.tex", caption="Descriptive Statistics", column_format = "lccc" # l: left, c:center ,) #Columns format summary_table.style.format(subset="Mean", precision=2).format(subset="Standard Deviation", precision=2).format(subset="Observations", precision=0)
File "<ipython-input-1-837ee459e0a7>", line 9 summary_table.style.format(subset="Mean", precision=2).format(subset="Standard Deviation", precision=2).format(subset="Observations", precision=0) ^ SyntaxError: invalid syntax

Ahora se procede a Replicar la Tabla 3 del Paper Miguel et al (2004)

import pandas as pd import numpy as np #plots library import matplotlib.pyplot as plt import seaborn as sns # import linear models library import statsmodels.api as sm import statsmodels.formula.api as smf !pip install statsmodels data = pd.read_stata("/Users/enriquerios/Desktop/PUCP 2022.2/R y Python/Trabajo Final (2.0)/mss_repdata.dta",)
Requirement already satisfied: statsmodels in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (0.12.2) Requirement already satisfied: scipy>=1.1 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (1.6.2) Requirement already satisfied: numpy>=1.15 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (1.20.1) Requirement already satisfied: pandas>=0.21 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (1.2.4) Requirement already satisfied: patsy>=0.5 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (0.5.1) Requirement already satisfied: python-dateutil>=2.7.3 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.21->statsmodels) (2.8.1) Requirement already satisfied: pytz>=2017.3 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.21->statsmodels) (2021.1) Requirement already satisfied: six in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from patsy>=0.5->statsmodels) (1.15.0)
data
# Dummy por país para efectos fijos que nos servira más adelante data['Dummy'] = np.where(data['ccode'] > 0 , 1, 0) smf.ols('ccode ~ Dummy', data).fit(cov_type = 'HC1') # HC0: white , HC1: Hubert - White model1 = smf.ols('ccode ~ Dummy', data).fit(cov_type = 'HC1').summary() print(model1)
OLS Regression Results ============================================================================== Dep. Variable: ccode R-squared: 0.000 Model: OLS Adj. R-squared: 0.000 Method: Least Squares F-statistic: 6.254e+04 Date: Sun, 11 Dec 2022 Prob (F-statistic): 0.00 Time: 17:29:32 Log-Likelihood: -4018.6 No. Observations: 743 AIC: 8039. Df Residuals: 742 BIC: 8044. Df Model: 0 Covariance Type: HC1 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 248.0451 0.992 250.075 0.000 246.101 249.989 Dummy 248.0451 0.992 250.075 0.000 246.101 249.989 ============================================================================== Omnibus: 105.894 Durbin-Watson: 0.086 Prob(Omnibus): 0.000 Jarque-Bera (JB): 35.215 Skew: 0.284 Prob(JB): 2.26e-08 Kurtosis: 2.097 Cond. No. 2.57e+16 ============================================================================== Notes: [1] Standard Errors are heteroscedasticity robust (HC1) [2] The smallest eigenvalue is 2.25e-30. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
smf.ols('ccode ~ Dummy', data).fit(cov_type = 'HC1').summary2().tables[1]
# Primer Modelo # Se procede a replicar la tabla considerando dos modelos. MODEL 1 # GPCP_g es la Tasa de Variación de las lluvias en el periodo t # GPCP_g_l es la Tasa de Variación de las lluvias en el periodo t-1 # El primer modelo considera la variable de interés: any_prio // Variable Explicativa: GPCP_g model1 = smf.ols( 'ccode ~ Dummy + any_prio + GPCP_g ', data).fit(cov_type = 'HC1').summary2().tables[1] model1_coef = model1.iloc[1,0] # fila posición 1 y columan posición 0 model1_coef_se = model1.iloc[1,1] # fila posición 1 y columan posición 1 # HC1: standar error robust aginst heterocedasticity model1_lower = model1.iloc[1,4] model1_upper = model1.iloc[1,5]
#Segundo Modelo # Mientras que en el segundo modelo se considera la variable de interés: war_prio // Variable Explicativa: GPCP_g_l # Se muestra efectos fijos (country), Si country-time trends MODEL 2 # Se inclue a la tasa de variación de las lluvias del año siguiente (GPCP_g_fl) # Errores estandar robustas (Huber-White robust) # Termino de perturbación están clusterizados (agrupados) a nivel país model2 = smf.ols('ccode ~ Dummy + war_prio + GPCP_g_l' , data).fit(cov_type = 'HC1').summary2().tables[1] model2_coef = model2.iloc[1,0] model2_coef_se = model2.iloc[1,1] model2_lower = model2.iloc[1,4] model2_upper = model2.iloc[1,5]
# CoefPlot fig, ax = plt.subplots(figsize=(7, 6)) ax.scatter(x=table_pandas['Model'], marker='o', s=20, # s: modificar tamaño del point y=table_pandas['Estimate'], color = "black") eb1 = plt.errorbar(x=table_pandas['Model'], y=table_pandas['Estimate'], yerr = 0.5*(table_pandas['Upper_bound']- table_pandas['Lower_bound']), color = 'black', ls='', capsize = 4) # ls='': no une los puntos rojos # yerr genera el gráfico del intervalo de confianza plt.axhline(y=0, color = 'black').set_linestyle('--') # linea horizontal # Set title & labels plt.title('Smoking Coefficient (95% CI)',fontsize=12)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3079 try: -> 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'Model' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-26-5e166e886f06> in <module> 2 fig, ax = plt.subplots(figsize=(7, 6)) 3 ----> 4 ax.scatter(x=table_pandas['Model'], 5 marker='o', s=20, # s: modificar tamaño del point 6 y=table_pandas['Estimate'], color = "black") ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key) 3022 if self.columns.nlevels > 1: 3023 return self._getitem_multilevel(key) -> 3024 indexer = self.columns.get_loc(key) 3025 if is_integer(indexer): 3026 indexer = [indexer] ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: -> 3082 raise KeyError(key) from err 3083 3084 if tolerance is not None: KeyError: 'Model'
Image in a Jupyter notebook
# Si el sm function es (homocedasdico) ols_model = sm.OLS(y, X).fit() ols_model.summary().tables[0] ols_model.summary().tables[1] ols_model.summary().tables[2] print(ols_model.summary())
control_formula = "any_prio ~ GPCP_g + GPCP_g_l" ols_model = smf.ols(control_formula, data=data).fit() print(ols_model.summary())
# Modelo con Efectos Fijos # Si efectos fijos (country), Si country-time trends # errores estandar robustos (Huber-White robust) # termino de perturbación están clusterizados (agrupados) a nivel país # No se añade variables de control formula_model1 = "any_prio ~ GPCP_g + GPCP_g_l + C(ccode)" + ' + ' + ' + '.join( country_trend ) ols_model1 = smf.ols(formula_model1, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['ccode']}) print(ols_model1.summary()) rmse_ol1 = round(mean_squared_error( y, ols_model1.predict())**0.5,2) print(rmse_ol1) # Model OLS: Si efectos fijos (country), Si country-time trends Se inclue a la tasa de variación de las lluvias del año siguiente (GPCP_g_fl) errores estandar robustas (Huber-White robust) termino de perturbación están clusterizados (agrupados) a nivel país formula_model2 = "war_prio ~ GPCP_g + GPCP_g_l + GPCP_g_fl + C(ccode)" + ' + ' + ' + '.join( country_trend ) ols_model2 = smf.ols(formula_model2, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['ccode']}) print(ols_model2.summary()) rmse_ol2 = round(mean_squared_error( y, ols_model2.predict())**0.5,2) print(rmse_ol2) # En caso que quisieramos eliminar los terminos de perturbación, debemos borrar los missing presenten en algunas de nuestras variables # En este caso tot_100_g presente missings. Por ello, primero borramos los missings de esa columna. data_na = data.dropna(subset = 'tot_100_g') formula_model3 = "war_prio ~ GPCP_g + GPCP_g_l + tot_100_g + C(ccode)" + ' + ' + ' + '.join( country_trend ) ols_model3 = smf.ols(formula_model3, data=data_na).fit(cov_type='cluster', cov_kwds={'groups': data_na['ccode']}) print(ols_model3.summary()) y_na = data_na['gdp_g'] # Es el Nuevo vector de la variable Y rmse_ol3 = round(mean_squared_error( y_na, ols_model3.predict())**0.5,2) print(rmse_ol3)

Exportar Tabla 3

# Lista de explicativa a mostrarse en la tabla explicativas = ['GPCP_g','GPCP_g_l','GPCP_g_fl','tot_100_g','y_0', 'polity2l', 'ethfrac', 'relfrac', 'Oil', 'lmtnest','lpopl1'] # etiquetas a las variables etiquetas = ['Growth in rainfall, t','Growth in rainfall, t-1'] labels = dict(zip(explicativas,etiquetas)) labels pystout(models = [ols_model1,ols_model2,], file='regression_table_2.tex', digits=3, endog_names=['OLS','OLS','OLS'], exogvars =explicativas , # sellecionamos las variables varlabels = labels, # etiquetas a las variables mgroups={'Dependent Variable: Civil Conflict 25 Deaths':[1,5]}, # titulo a las regresiones modstat={'nobs':'Observarions','rsquared':'R\sym{2}'}, # estadísticos addrows={'Country fixed effects':['yes','yes','yes'], 'Country-specific time trends' : ['yes','yes','yes'], 'Root mean square error': [rmse_ol1,rmse_ol2,rmse_ol3]}, # añadimos filas addnotes=['Note.—Huber robust standard errors are in parentheses.', 'Regression disturbance terms are clustered at the country level.', 'A country-specific year time trend is included in all specifications (coefficient estimates not reported).', '* Significantly different from zero at 90 percent confidence.', '** Significantly different from zero at 95 percent confidence.', '*** Significantly different from zero at 99 percent confidence.'], title='Rainfall and Civil Conflict (Reduced-Form)', stars={.1:'*',.05:'**',.01:'***'} )

Graficar Coeft Plot del Coeficiente Estimado de GPCP_g

#Se instalan los packages necesarios import pandas as pd import numpy as np #plots library import matplotlib.pyplot as plt import seaborn as sns # import linear models library import statsmodels.api as sm import statsmodels.formula.api as smf
# Se instala statsmodels !pip install statsmodels
Requirement already satisfied: statsmodels in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (0.12.2) Requirement already satisfied: scipy>=1.1 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (1.6.2) Requirement already satisfied: patsy>=0.5 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (0.5.1) Requirement already satisfied: numpy>=1.15 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (1.20.1) Requirement already satisfied: pandas>=0.21 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from statsmodels) (1.2.4) Requirement already satisfied: python-dateutil>=2.7.3 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.21->statsmodels) (2.8.1) Requirement already satisfied: pytz>=2017.3 in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.21->statsmodels) (2021.1) Requirement already satisfied: six in /Users/enriquerios/opt/anaconda3/lib/python3.8/site-packages (from patsy>=0.5->statsmodels) (1.15.0)
# Se procede a leer la data del paper Economic Shocks and Civil Conflict en formato dta. data = pd.read_stata("/Users/enriquerios/Desktop/PUCP 2022.2/R y Python/Trabajo Final (2.0)/mss_repdata.dta", convert_categoricals=False) # convert_categoricals=False: No se lee las etiquetas de valor
data
# First Model model1 = smf.ols('lbwght ~ Dummy', data).fit(cov_type = 'HC1').summary2().tables[1] model1_coef = model1.iloc[1,0] # fila posición 1 y columan posición 0 model1_coef_se = model1.iloc[1,1] # fila posición 1 y columan posición 1 # HC1: standar error robust aginst heterocedasticity model1_lower = model1.iloc[1,4] model1_upper = model1.iloc[1,5] # Second Model model2 = smf.ols('lbwght ~ Dummy + motheduc', data).fit(cov_type = 'HC1').summary2().tables[1] model2_coef = model2.iloc[1,0] model2_coef_se = model2.iloc[1,1] model2_lower = model2.iloc[1,4] model2_upper = model2.iloc[1,5] able = np.zeros( ( 3, 4 ) ) table[0,0] = model1_coef table[0,1] = model1_coef_se table[0,2] = model1_lower table[0,3] = model1_upper table[1,0] = model2_coef table[1,1] = model2_coef_se table[1,2] = model2_lower table[1,3] = model2_upper table[2,0] = model3_coef table[2,1] = model3_coef_se table[2,2] = model3_lower table[2,3] = model3_upper table_pandas = pd.DataFrame( table, columns = [ "Estimate","Std. Error","Lower_bound" , "Upper_bound"]) table_pandas.index = [ "OLS baseline","OLS with controls", "OLS interactive model"] table_pandas.reset_index(inplace = True) table_pandas.rename(columns = {"index" : "Model"}, inplace = True) table_pandas.round(8) fig, ax = plt.subplots(figsize=(7, 6)) ax.scatter(x=table_pandas['Model'], marker='o', s=20, # s: modificar tamaño del point y=table_pandas['Estimate'], color = "black") eb1 = plt.errorbar(x=table_pandas['Model'], y=table_pandas['Estimate'], yerr = 0.5*(table_pandas['Upper_bound']- table_pandas['Lower_bound']), color = 'black', ls='', capsize = 4) # ls='': no une los puntos rojos # yerr genera el gráfico del intervalo de confianza plt.axhline(y=0, color = 'black').set_linestyle('--') # linea horizontal # Set title & labels plt.title('Smoking Coefficient (95% CI)',fontsize=12)