GitHub Repository: YStrano/DataScience_GA
Path: blob/master/projects/project_3/starter-code/Project 3 - working file.ipynb
Kernel: Python 3

Project 3

In this project, you will perform a logistic regression on admissions data

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import numpy as np
df = pd.read_csv("../assets/admissions.csv")
df.head()
df.isnull().sum()
admit       0
gre         2
gpa         2
prestige    1
dtype: int64
# .isnull() returns True/False per cell, and .any(axis=1) keeps the row if any cell is True
# (fyi .all(axis=1) keeps the row only if every cell is True)
df[df.isnull().any(axis=1)]
df.admit.value_counts()
0    273
1    127
Name: admit, dtype: int64

We are missing less than 1% of the data. It could be useful to fill in the missing values, or to investigate why they are missing, but for now I'm going to drop the NAs.
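If we wanted to keep all 400 rows instead, a simple alternative would be imputation. The sketch below is illustrative only (this notebook proceeds with dropna); it fills gre and gpa with their medians and prestige with its most frequent value.

# illustrative sketch only -- not used in the rest of this notebook
df_filled = df.copy()
df_filled['gre'] = df_filled['gre'].fillna(df_filled['gre'].median())
df_filled['gpa'] = df_filled['gpa'].fillna(df_filled['gpa'].median())
df_filled['prestige'] = df_filled['prestige'].fillna(df_filled['prestige'].mode()[0])
df_filled.isnull().sum()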

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
admit       400 non-null int64
gre         398 non-null float64
gpa         398 non-null float64
prestige    399 non-null float64
dtypes: float64(3), int64(1)
memory usage: 12.6 KB
df = df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 397 entries, 0 to 399
Data columns (total 4 columns):
admit       397 non-null int64
gre         397 non-null float64
gpa         397 non-null float64
prestige    397 non-null float64
dtypes: float64(3), int64(1)
memory usage: 15.5 KB

Part 1. Frequency Tables

1. Let's create a frequency table of our variables. Look at the documentation for pd.crosstab

df.columns
Index(['admit', 'gre', 'gpa', 'prestige'], dtype='object')
# .size() after a groupby across multiple columns is effectively a .value_counts() across those columns
cnts = (df[["admit", "prestige"]]
        .groupby(["admit", "prestige"])
        .size()
        .rename("count")
        .to_frame())
cnts
cnts_us = cnts.unstack()
cnts_us
cnts_us.columns
MultiIndex(levels=[['count'], [1.0, 2.0, 3.0, 4.0]], labels=[[0, 0, 0, 0], [0, 1, 2, 3]], names=[None, 'prestige'])
cnts_us.columns = cnts_us.columns.droplevel()
cnts_us.columns
Float64Index([1.0, 2.0, 3.0, 4.0], dtype='float64', name='prestige')
cnts_us.plot.barh(figsize=(11, 8))
<matplotlib.axes._subplots.AxesSubplot at 0xe43bcf8>
Image in a Jupyter notebook
cnts_us.plot.barh(stacked=True, figsize=(11, 8))
<matplotlib.axes._subplots.AxesSubplot at 0xe280c18>
Image in a Jupyter notebook
cnts["pcnt_of_total"] = cnts / cnts.sum() * 100
ax = cnts.pcnt_of_total.unstack().plot.barh(figsize=(11, 8))
t = ax.set_title("frequency (% of total)")
Image in a Jupyter notebook
cnts["count"].groupby(level=0).sum()
admit
0    271
1    126
Name: count, dtype: int64
cnts["count"].groupby(level=1).sum()
prestige
1.0     61
2.0    148
3.0    121
4.0     67
Name: count, dtype: int64
admit_groups = cnts["count"].groupby(level=0).sum()
admit_groups
admit
0    271
1    126
Name: count, dtype: int64
cnts.loc[0, "count"]
prestige
1.0    28
2.0    95
3.0    93
4.0    55
Name: count, dtype: int64
admit_groups.loc[0]
271
cnts.loc[0, "count"] / admit_groups.loc[0]
prestige
1.0    0.103321
2.0    0.350554
3.0    0.343173
4.0    0.202952
Name: count, dtype: float64
admit_0_pcnts = (pd.concat({0: cnts.loc[0, "count"] / admit_groups.loc[0] * 100}, axis=0)
                 .rename("pcnt_of_admit")
                 .to_frame())
admit_0_pcnts.index = admit_0_pcnts.index.rename("admit", level=0)
admit_1_pcnts = (pd.concat({1: cnts.loc[1, "count"] / admit_groups.loc[1] * 100}, axis=0)
                 .rename("pcnt_of_admit")
                 .to_frame())
admit_1_pcnts.index = admit_1_pcnts.index.rename("admit", level=0)
admit_pcnts = pd.concat([admit_0_pcnts, admit_1_pcnts])
admit_pcnts

The below is a for loop that does the above in one step, but when it is run the join fails, so we need to use concat instead...

admit_pcnts = pd.DataFrame()
for i in range(2):
    num = i
    new_df = (pd.concat({num: cnts.loc[num, "count"] / admit_groups.loc[num] * 100}, axis=0)
              .rename("pcnt_of_admit")
              .to_frame())
    if num == 0:
        admit_pcnts = new_df
        admit_pcnts.index = admit_pcnts.index.rename("join", level=0)
    else:
        admit_pcnts = pd.concat([admit_pcnts, new_df])
admit_pcnts
#cnts = cnts.join(admit_pcnts) #this throws an error but concat below works
cnts = pd.concat([cnts, admit_pcnts], axis=1)

The default for concat is axis=0, which appends rows; use axis=1 instead to concatenate columns.
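A tiny illustration of that axis behavior, using two hypothetical toy frames (not part of the admissions data):

# hypothetical toy frames, just to show what the axis argument does
a = pd.DataFrame({'x': [1, 2]})
b = pd.DataFrame({'y': [3, 4]})
pd.concat([a, b])           # axis=0 (default): stacks the rows, giving 4 rows with NaNs where columns don't match
pd.concat([a, b], axis=1)   # axis=1: lines the frames up side by side on the index, giving 2 rows and 2 columns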

cnts
ax = cnts.pcnt_of_admit.unstack().plot.barh(figsize=(11, 8))
t = ax.set_title("frequency (% of admit)")
Image in a Jupyter notebook

The below uses crosstab... I'm not 100% sure about some of the syntax technicalities.

prestige = pd.crosstab(index=df['prestige'], columns='count')
prestige
# pd.crosstab(index=df['prestige'], columns='count').sum()
print(prestige.sum(), "\n")    # sum the counts
print(prestige.shape, "\n")    # check number of rows and columns
prestige.iloc[2:5]             # slice rows 3-4
col_0
count    397
dtype: int64

(4, 1)
prestige/prestige.sum()
admit = pd.crosstab(index=df['admit'], columns="count")
admit
comb = pd.crosstab(index=df['admit'], columns=df['prestige'])
comb.columns = ['prestige 1', 'prestige 2', 'prestige 3', 'prestige 4']
comb.index = ['admit 0', 'admit 1']
comb
# you can get the marginal counts (totals for each row and column) by including the argument margins=True:
comb = pd.crosstab(index=df['admit'], columns=df['prestige'], margins=True)
comb.columns = ['prestige 1', 'prestige 2', 'prestige 3', 'prestige 4', 'rowtotal']
comb.index = ['admit 0', 'admit 1', 'coltotal']
comb
comb.sum()-comb.loc["coltotal"]
prestige 1     61
prestige 2    148
prestige 3    121
prestige 4     67
rowtotal      397
dtype: int64
# to get the total proportion of counts in each cell, divide the table by the grand total:
comb / comb.loc["coltotal", "rowtotal"]
# to get the proportion of counts along each column (in this case, the admittance rate
# within each prestige class), divide by the column totals:
comb / comb.loc["coltotal"]
# to get the proportion of counts along each row, divide by the row totals.
# the division operator works on a row-by-row basis when used on DataFrames by default;
# in this case we want to divide each column by the rowtotal column,
# so to get division to work on a column-by-column basis, use df.div() with axis set to 0 (or "index"):
comb.div(comb["rowtotal"], axis=0)
# alternatively, you can transpose the table with df.T to swap rows and columns
# and perform row-by-row division as normal:
comb.T / comb["rowtotal"]
# the crosstab() function lets you create tables out of more than two categories.
# higher dimensional tables can be a little confusing to look at,
# but they can also yield finer-grained insight into interactions between multiple variables:
admit_gre_gpa_prestige = pd.crosstab(index=df['admit'],
                                     columns=[df['prestige'], df['gpa'], df['gre']],
                                     margins=True)   # include row and column totals
admit_gre_gpa_prestige
# notice that by passing a second or third variable to the columns argument,
# the resulting table has columns categorized by gre, gpa and prestige.
# the outermost index (prestige) returns sections of the table instead of individual columns:
admit_gre_gpa_prestige[2]   # get the subtable under prestige 2

Part 2. Return of dummy variables

the below two cells are just notes for reference from a lesson on dummy variables...

# note: 'data' here is a DataFrame from the lesson and is not defined in this notebook,
# which is why this cell raises the NameError below

# set a seed for reproducibility
np.random.seed(12345)

# create a Series of booleans in which roughly half are True
nums = np.random.rand(len(data))
mask_large = nums > 0.5

# initially set size to small, then change roughly half to be large
data['size'] = 'small'
data.loc[mask_large, 'size'] = 'large'
data.head()

# create a new series called new_ser
data['new_ser'] = data['size'].map({'small': 0, 'large': 1})
data.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-efd39e2a6327> in <module>()
      3
      4 # create a Series of booleans in which roughly half are True
----> 5 nums = np.random.rand(len(data))
      6 mask_large = nums > 0.5
      7
NameError: name 'data' is not defined
# set a seed for reproducibility
np.random.seed(123456)

# assign roughly one third of observations to each group
# (again, 'data' comes from the lesson and is not defined here)
nums = np.random.rand(len(data))
mask_suburban = (nums > 0.33) & (nums < 0.66)
mask_urban = nums > 0.66
data['area'] = 'rural'
data.loc[mask_suburban, 'area'] = 'suburban'
data.loc[mask_urban, 'area'] = 'urban'
data.head()

Part 2. Return of dummy variables

pd.get_dummies(df['prestige']).head()
# .sum(axis=1) sums across the columns; the describe() output below (mean 1, std 0) shows that
# every row has exactly one 1 across the dummy columns
pd.get_dummies(df['prestige']).sum(axis=1).describe()
count    397.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50%        1.0
75%        1.0
max        1.0
dtype: float64
# one-hot encoding
# create four dummy variables using get_dummies, then exclude the first dummy column
dummies = pd.get_dummies(df['prestige'], prefix='prestige', drop_first=True)
# unless the categorical values are mutually exclusive and every row takes exactly one of them,
# you do NOT drop one of the columns

# concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns)
# df1 = pd.concat([df, dummies], axis=1)
df1 = df.join(dummies)   # defaults to a left join
df1.head()

2.1 Create class or dummy variables for prestige

prestige_dummies = pd.get_dummies(df['prestige'])
prestige_dummies.head()
prestige_dummies.rename(columns={1.0: 'Prestige1', 2.0: 'Prestige2', 3.0: 'Prestige3', 4.0: 'Prestige4'},
                        inplace=True)
prestige_dummies.head()

2.2 When modeling our class variables, how many do we need?

Answer:

3 dummies are needed.

When presented with a categorical variable for which every row must take exactly one value, you should drop one of the dummy columns to avoid redundancy among your exogenous variables (e.g. for a coin flip you only need a heads column or a tails column, not both). However, if you have a categorical variable for which a row could take multiple values or none, then you keep all of the columns.
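A tiny sketch of that redundancy, using a hypothetical coin column rather than the admissions data:

# hypothetical example -- not part of the admissions data
coin = pd.Series(['heads', 'tails', 'heads', 'tails'], name='coin')
pd.get_dummies(coin)                    # two columns, but each is just 1 minus the other
pd.get_dummies(coin, drop_first=True)   # one column carries all of the information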

Part 3. Hand calculating odds ratios

Develop your intuition about expected outcomes by hand calculating odds ratios.

cols_to_keep = ['admit', 'gre', 'gpa']
handcalc = df[cols_to_keep].join(prestige_dummies)
handcalc.head()
# discovery calcs (only the last expression in the cell is displayed):
len(handcalc['admit'])
len(handcalc[handcalc['admit'] == 0])
handcalc['admit'].sum()
len(handcalc[handcalc['Prestige1'] == 0])
handcalc['Prestige1'].sum()
handcalc['Prestige1'].value_counts()
0    336
1     61
Name: Prestige1, dtype: int64
# the relabelled columns below do not line up with the prestige levels because crosstab builds
# its columns from the combinations of the four dummy indicators; those combinations sort
# lexicographically, so (0, 0, 0, 1) -- i.e. prestige 4 -- comes first and (1, 0, 0, 0) last
comb = pd.crosstab(index=handcalc['admit'],
                   columns=[handcalc['Prestige1'], handcalc['Prestige2'],
                            handcalc['Prestige3'], handcalc['Prestige4']],
                   margins=True)
comb.columns = ['prestige 1', 'prestige 2', 'prestige 3', 'prestige 4', 'rowtotal']
comb.index = ['admit 0', 'admit 1', 'coltotal']
comb
pd.crosstab(df['admit'], df['prestige'], rownames=['admit'])
comb = pd.crosstab(index=df['admit'], columns=df['prestige'])
comb.columns = ['prestige 1', 'prestige 2', 'prestige 3', 'prestige 4']
comb.index = ['admit 0', 'admit 1']
comb
handcalc
handcalc.describe()
prestige_1 = pd.crosstab(index=handcalc['Prestige1'], columns='count')
prestige_1
admit = pd.crosstab(index=handcalc['admit'], columns='count')
admit
# crosstab of 'prestige 1' vs admission, indexed by 'admit'
# frequency table cutting prestige 1 by whether or not someone was admitted
comb1 = pd.crosstab(index=handcalc['admit'], columns=handcalc['Prestige1'])
# the column order follows the sorted values of Prestige1: 0 (not prestige 1) first, then 1
comb1.columns = ['not prestige 1', 'prestige 1']
comb1.index = ['admit 0', 'admit 1']
comb1
# crosstab of 'prestige 1' vs admission, indexed by 'prestige 1'
# frequency table cutting prestige 1 by whether or not someone was admitted
comb2 = pd.crosstab(index=handcalc['Prestige1'], columns=handcalc['admit'])
# again the column order follows the sorted values: admit 0 first, then admit 1
comb2.columns = ['admit 0', 'admit 1']
comb2.index = ['not prestige 1', 'prestige 1']
comb2
comb3 = pd.crosstab(handcalc['admit'], handcalc['Prestige1'], rownames=['admit'], colnames=['Prestige1'])
comb3
comb4 = pd.crosstab(handcalc['Prestige1'], handcalc['admit'], rownames=['Prestige1'], colnames=['admit'])
comb4

3.1 Use the cross tab above to calculate the odds of being admitted to grad school if you attended a #1 ranked college

odds1 = comb4.iloc[1][1] / (comb4.iloc[1].sum() - comb4.iloc[1][1])
odds1
1.1785714285714286

odds: 33:28 (admitted vs. not admitted among prestige 1 attendees)

prob1 = 33 / (33 + 28)
prob1
0.5409836065573771
comb1
comb1['prestige 1']
admit 0    28
admit 1    33
Name: prestige 1, dtype: int64
comb1['not prestige 1']
admit 0    243
admit 1     93
Name: not prestige 1, dtype: int64
comb1
comb1['prestige 1']['admit 1']
33
comb1['prestige 1']['admit 0']
28

3.2 Now calculate the odds of admission if you did not attend a #1 ranked college

comb4
comb4.iloc[0, 1]
93
comb4.loc[0, 1]   # .loc uses labels, so this only works because the index values happen to be 0 and 1
93
odds_n1 = comb4.iloc[0][1] / (comb4.iloc[0].sum() - comb4.iloc[0][1])
odds_n1
0.38271604938271603
prob4 = 93 / (93 + 243)
prob4
0.2767857142857143

3.3 Calculate the odds ratio

odds ratio (prestige 1 vs. not prestige 1): (33/28) / (93/243) ≈ 3.08 (note that 93:243 on its own is just the odds of admission for non-prestige 1 attendees, not the ratio)
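As a quick check, the odds ratio is just the ratio of the two hand-calculated odds:

# quick check of the odds ratio from the crosstab counts above
odds_p1 = 33 / 28         # odds of admission for prestige 1 attendees
odds_not_p1 = 93 / 243    # odds of admission for everyone else
odds_p1 / odds_not_p1     # odds ratio, roughly 3.08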

3.4 Write this finding in a sentence:

Answer:

We see that prestige plays a big role in admittance to grad school. If you did not attend a prestige 1 school, your odds of being admitted (93:243) are much lower. Non-prestige 1 attendees stand roughly a 28% chance of admittance, versus roughly a 54% chance for prestige 1 attendees.

3.5 Print the cross tab for prestige_4

comb5 = pd.crosstab(handcalc['Prestige4'], handcalc['admit'], rownames=['Prestige4'], colnames=['admit'])
comb5

3.6 Calculate the Odds Ratio

odds of admission for prestige 4 attendees: 12:55 ≈ 0.22; odds ratio (prestige 4 vs. prestige 1): (12/55) / (33/28) ≈ 0.19

odds4 = 12 / (67 - 12)
odds4
0.21818181818181817
prob4 = 12 / (12 + 55)
prob4
0.1791044776119403

3.7 Write this finding in a sentence

Answer:

We see that if you attended a prestige 4 school, your odds of being admitted (12:55) are even more bleak. Prestige 4 attendees stand roughly an 18% chance of admittance, versus roughly a 54% chance for prestige 1 attendees.

Part 4. Analysis

# in [rows, columns] indexing, the first slice selects rows and the second selects columns
prestige_dummies.iloc[:, 1:]
# create a clean data frame for the regression, dropping one of the dummy columns
cols_to_keep = ['admit', 'gre', 'gpa']
data = df[cols_to_keep].join(prestige_dummies.iloc[:, 1:])   # iloc[:, 1:] keeps all rows, columns 2 onward
data.head()

if using statsmodel

We will add a constant term for our Logistic Regression.

The statsmodels function requires that intercepts/constants are specified explicitly.

make sure to come back to this with Abe.

# have not pulled statsmodels into the regression...
# to do so, manually add the intercept:
# data['intercept'] = 1.0
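A hedged sketch of what the statsmodels route might look like (not run in this notebook; it assumes the data frame built just above and also covers hint 2 in section 4.4 below):

# sketch only -- not run in this notebook
data_sm = data.copy()
data_sm['intercept'] = 1.0   # statsmodels needs the constant added explicitly
sm_cols = ['gre', 'gpa', 'Prestige2', 'Prestige3', 'Prestige4', 'intercept']
result = sm.Logit(data_sm['admit'], data_sm[sm_cols]).fit()
print(result.summary())
params = result.params
conf = result.conf_int()     # 95% confidence intervals for the coefficients
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
np.exp(conf)                 # odds ratios with their confidence bounds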

4.1 Create the X and Y variables

feature_cols = ['gre', 'gpa', 'Prestige2', 'Prestige3', 'Prestige4']
X = data[feature_cols]   # create X (feature_cols is already a list, so data[feature_cols] returns a DataFrame without needing [[...]])
y = data['admit']        # create y

4.2 Fit the model -

  • Load sklearn's logistic regression

  • Create the regression object

  • Fit the model

# fitting a logistic regression model and storing the class predictions
from sklearn.linear_model import LogisticRegression   # load sklearn's logistic regression

logreg = LogisticRegression()   # create the regression object / instantiate the model
logreg.fit(X, y)                # fit
pred = logreg.predict(X)        # predict
logreg.score(X, y)              # this returns the accuracy
0.7052896725440806

4.3 Print the coefficients

print(logreg.coef_)
print(logreg.intercept_)   # note: this is the fitted intercept term of the logistic regression
print(df.admit.mean())
[[ 0.00178497  0.23229458 -0.60347467 -1.17214957 -1.37729795]]
[-1.81701706]
0.31738035264483627
  • if you predicted 0 for every observation, you would be right about 68% of the time

  • if you predicted 1 for every observation, you would be right about 32% of the time

  • that is not a very good model (see the quick baseline check below)
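A quick sanity check of that baseline (a sketch, reusing y from section 4.1):

# accuracy of always predicting the majority class (0 = not admitted)
baseline_acc = 1 - y.mean()   # roughly 0.68, versus the model's 0.705 above
baseline_acc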

df.admit.mean()
0.31738035264483627
admit_perc = 126 / (271 + 126)
admit_perc
0.31738035264483627
# this is the prediction, based on a default threshold of 0.5:
# if the model outputs 0.49 you are not getting in, if it outputs 0.51 you are getting in
print(pred)
[0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
proba = logreg.predict_proba(X) #predict
# the first column is always the probability of the class you are not trying to predict (not admitted),
# and the second is the probability of the class you are trying to predict (admitted)
proba[:5]
array([[0.81340581, 0.18659419], [0.72283424, 0.27716576], [0.36815534, 0.63184466], [0.78766596, 0.21233404], [0.82998297, 0.17001703]])
print(proba[:, 1])
[0.18659419 0.27716576 0.63184466 0.21233404 0.17001703 0.40925899 0.46874205 0.27070755 0.22481859 0.43526125 0.3021334 0.42956137 0.61508543 0.38804338 0.58943772 0.2086162 0.28842267 0.14780737 0.4696848 0.50797682 0.20419042 0.40150911 0.18720704 0.2245205 0.42905043 0.61328896 0.53201833 0.19823728 0.43039855 0.46889457 0.20548533 0.2985104 0.24445981 0.3470547 0.39054492 0.26933392 0.49330638 0.19983482 0.30982793 0.1917883 0.29759959 0.35114953 0.35028634 0.20952587 0.38034939 0.20315947 0.35859467 0.16632301 0.13790066 0.18287472 0.27885915 0.15685126 0.2515047 0.39004865 0.26207308 0.32319895 0.22294903 0.16411395 0.29762611 0.18720704 0.36005798 0.19411147 0.27007075 0.29295988 0.26411199 0.37388842 0.26259399 0.51405567 0.51884944 0.61713824 0.28544585 0.12126306 0.17507905 0.38792808 0.24829637 0.31543981 0.22986517 0.3470547 0.46795069 0.55449361 0.21905523 0.35419158 0.2893614 0.13704555 0.22090155 0.31001084 0.35932599 0.36792625 0.54844488 0.42232602 0.43012938 0.57779175 0.47837212 0.33079645 0.39095086 0.38488386 0.22543736 0.32422693 0.37707003 0.18149032 0.16103486 0.24516062 0.14899759 0.24786319 0.41949499 0.39898459 0.56449662 0.30224643 0.16874403 0.3181495 0.22010288 0.16077103 0.16115043 0.50048862 0.30746962 0.27775423 0.30286585 0.42387752 0.61549032 0.15394668 0.34898023 0.28019818 0.19798409 0.19711203 0.30945167 0.19072987 0.51906295 0.26802733 0.32836543 0.16226105 0.35952291 0.34750873 0.3553953 0.20081418 0.3239586 0.21970452 0.19447511 0.30778223 0.37191557 0.52138209 0.40970327 0.24468598 0.23642291 0.23792645 0.2027443 0.18033094 0.31563525 0.20422987 0.4294113 0.56776963 0.61768695 0.28468344 0.41026519 0.24046282 0.26399297 0.18322198 0.30247805 0.46789024 0.3937199 0.29894617 0.34922974 0.38578168 0.41482776 0.29772763 0.35009173 0.58943772 0.16026024 0.30401809 0.23730672 0.25402162 0.18849425 0.20221073 0.27547872 0.45121966 0.18515127 0.37028915 0.29241274 0.24376036 0.24806796 0.14748448 0.26807511 0.19773263 0.43983463 0.36248474 0.16060583 0.46101581 0.24030665 0.33009327 0.32086073 0.35545092 0.32325207 0.27421186 0.15681915 0.3673862 0.35733037 0.2501259 0.15058568 0.25446206 0.21699227 0.31167627 0.34482496 0.58943772 0.1774045 0.53931663 0.32867823 0.58083079 0.54204898 0.21564452 0.36073462 0.25019509 0.33544079 0.25699826 0.37225848 0.24321965 0.36900857 0.46247623 0.42465513 0.35733037 0.22039677 0.29974121 0.43568305 0.31970516 0.42095357 0.29091258 0.38173401 0.17815459 0.31714252 0.41559946 0.19864539 0.239348 0.27746986 0.16530806 0.60610292 0.3531296 0.34648476 0.16281007 0.2812499 0.27012841 0.49905264 0.25203392 0.36220173 0.46332677 0.34233239 0.39392407 0.28931148 0.28295847 0.27282711 0.22316117 0.20718678 0.36282398 0.1968998 0.25812074 0.25566981 0.32613149 0.27589409 0.33694182 0.40297908 0.38124319 0.28835717 0.21082468 0.27589409 0.23957108 0.17726134 0.19122092 0.254903 0.40059856 0.19093704 0.41082734 0.22239891 0.41236676 0.53302242 0.3170099 0.34135294 0.21545885 0.51246769 0.21692947 0.37448924 0.41892942 0.16981012 0.22884978 0.17566235 0.29993119 0.20514593 0.58878595 0.23925947 0.26219139 0.12790159 0.37463272 0.41417429 0.41405647 0.63022211 0.40905352 0.21315294 0.47916656 0.29188215 0.34443863 0.28614413 0.37483311 0.27391986 0.27392974 0.42991975 0.12574905 0.20500651 0.47084348 0.36127047 0.32529897 0.18071049 0.25723223 0.40374395 0.28184366 0.19493537 0.19361478 0.22700797 0.24662064 0.27711696 0.18607541 0.47721426 0.19175067 0.31695733 0.16761541 0.26000719 0.19250947 0.57510549 0.39226151 0.35148401 
0.3522088 0.16503861 0.32319895 0.27345809 0.40619033 0.23275638 0.21039811 0.53779732 0.20928288 0.20264533 0.47258011 0.26795496 0.17486718 0.20197406 0.2003248 0.33751527 0.21707491 0.19895663 0.36093186 0.37695598 0.2837383 0.34259084 0.47526134 0.25242626 0.22633479 0.41257402 0.3592701 0.54792967 0.39857449 0.55898842 0.24371561 0.31243791 0.51008527 0.48939552 0.38288847 0.30995891 0.49077027 0.1844324 0.18430379 0.2612938 0.53679457 0.4777925 0.35873555 0.28864824 0.48973059 0.5181166 0.36752855 0.2003637 0.38447898 0.48417127 0.24563574 0.30376949 0.41991159 0.3439143 0.34496334 0.57205552 0.27786163 0.40955644 0.44943455 0.35326952 0.36778383 0.38633226 0.4294695 0.41554049 0.24360274 0.3910967 0.22422295 0.40501045 0.21697084 0.27121438 0.41991159 0.26608885]
from sklearn.metrics import precision_score as ps
from sklearn.metrics import recall_score as rc
from sklearn.metrics import confusion_matrix as cm
# precision = true positives / (true positives + false positives)
# precision speaks to how careful your model is:
# the cases you flagged correctly divided by everything you flagged
# this number should be as high as possible
# e.g. you won't cover your outdoor furniture unless it's a crazy hurricane -- you only act when it really counts
# e.g. expensive advertising: a banner on the highway is going for recall,
#      while a very expensive dinner with clients who are likely to convert is optimizing for precision
ps(y, pred)   # this gives the precision
0.6216216216216216
# recall = true positives / (true positives + false negatives)
# the cases you flagged correctly divided by all the cases that actually belong to the group
# this number should be as high as possible
# for cases like cancer screening you want high recall, to minimize the chance of false negatives
rc(y, pred)   # this gives the recall
0.18253968253968253
cm(y, pred) #this gives the confusion matrix, which is easier to read with labels
array([[257, 14], [103, 23]])
# layout of the confusion matrix above:
# [[TN, FP],
#  [FN, TP]]
# 23 is the true positives, predicted admitted and actually admitted
# 14 is the false positives, predicted admitted but not actually admitted
23 / (14 + 23)   # precision
0.6216216216216216
# 23 is the true positives, predicted admitted and actually admitted
# 103 is the false negatives, predicted not admitted but actually admitted
23 / (23 + 103)   # recall
0.18253968253968253
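A small sketch that unpacks the four cells of the confusion matrix by name instead of reading them off by eye:

tn, fp, fn, tp = cm(y, pred).ravel()   # sklearn lays the matrix out as [[TN, FP], [FN, TP]]
print(tn, fp, fn, tp)                  # 257 14 103 23
print(tp / (tp + fp))                  # precision, matches ps(y, pred) above
print(tp / (tp + fn))                  # recall, matches rc(y, pred) above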

4.4 Calculate the odds ratios of the coefficients

hint 1: np.exp(X)

(from original project)

hint 2: conf['OR'] = params

conf.columns = ['2.5%', '97.5%', 'OR']
  • odds = probability / (1 - probability) i.e. one specific outcome/the rest of the other outcomes

  • probability = odds / (1 + odds) i.e. one specific outcome/all outcomes

  • logistic regression passes the linear combination through the logistic (sigmoid) function so the output is squeezed between 0 and 1 (need to dive into the mechanics of this)

  • np.exp() converts the coefficients from log-odds back to odds (see the small sketch below for how and why)
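A small numeric sketch of that round trip: the coefficients live on the log-odds (logit) scale, so exponentiating them takes you back to odds.

# probability -> odds -> log-odds and back again
p = 0.25                      # a probability
odds = p / (1 - p)            # 0.333... : one specific outcome vs. the rest
log_odds = np.log(odds)       # the logit -- the scale the regression coefficients live on
np.exp(log_odds)              # exp undoes the log, giving back the odds
odds / (1 + odds)             # and odds / (1 + odds) gives back the probability, 0.25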

logreg.coef_   # this is a nested (2-D) array, which is why you need to index into it with [0] below
array([[ 0.00178497, 0.23229458, -0.60347467, -1.17214957, -1.37729795]])
#logodds = logreg.intercept_ + logreg.coef_[0] * ??? #logodds
# this gives the odds ratios
params = logreg.coef_[0]
np.exp(params)
array([1.00178657, 1.26149128, 0.546908 , 0.3097005 , 0.25225925])
# convert log odds to odds
odds = np.exp(params)
odds
array([1.00178657, 1.26149128, 0.546908 , 0.3097005 , 0.25225925])
# convert odds to probability
prob = odds / (1 + odds)
prob
array([0.50044624, 0.5578139 , 0.35354915, 0.23646666, 0.20144331])

4.5 Interpret the OR of Prestige_2

Answer:

  • the OR for Prestige2 is about 0.55, so the odds of admission for prestige 2 students are roughly 45% lower (about 0.55 times the odds) than for prestige 1 students

  • prestige 1 is the comparison group because it is the baseline (dropped) dummy variable

  • need to dig into this a bit more and understand the inner workings better (?)

4.6 Interpret the OR of GPA

Answer:

  • for each one-unit increase in GPA, the odds of admission are multiplied by about 1.26 (holding the other variables constant)

Bonus

Plot the probability of being admitted into graduate school, stratified by GPA and GRE score.

(from original project - not part of current project)

Part 5: Predicted probabilities

As a way of evaluating our classifier, we're going to recreate the dataset with every logical combination of input values. This will allow us to see how the predicted probability of admission increases/decreases across different variables. First we're going to generate the combinations using a helper function called cartesian (defined below).

We're going to use np.linspace to create a range of values for "gre" and "gpa". This creates a range of linearly spaced values from a specified minimum to a maximum value -- in our case just the min/max observed values.

def cartesian(arrays, out=None):
    """
    Generate a cartesian product of input arrays.

    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of.
    out : ndarray
        Array to place the cartesian product in.

    Returns
    -------
    out : ndarray
        2-D array of shape (M, len(arrays)) containing cartesian products
        formed of input arrays.

    Examples
    --------
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])
    """
    arrays = [np.asarray(x) for x in arrays]
    dtype = arrays[0].dtype

    n = np.prod([x.size for x in arrays])
    if out is None:
        out = np.zeros([n, len(arrays)], dtype=dtype)

    m = n // arrays[0].size                  # integer division (the Python 2 original used n / arrays[0].size)
    out[:, 0] = np.repeat(arrays[0], m)
    if arrays[1:]:
        cartesian(arrays[1:], out=out[0:m, 1:])
        for j in range(1, arrays[0].size):   # range() here -- the Python 2 original used xrange()
            out[j*m:(j+1)*m, 1:] = out[0:m, 1:]
    return out
# instead of generating all possible values of GRE and GPA, we're going
# to use an evenly spaced range of 10 values from the min to the max
gres = np.linspace(data['gre'].min(), data['gre'].max(), 10)
print(gres)
# array([ 220.        ,  284.44444444,  348.88888889,  413.33333333,
#         477.77777778,  542.22222222,  606.66666667,  671.11111111,
#         735.55555556,  800.        ])
gpas = np.linspace(data['gpa'].min(), data['gpa'].max(), 10)
print(gpas)
# array([ 2.26      ,  2.45333333,  2.64666667,  2.84      ,  3.03333333,
#         3.22666667,  3.42      ,  3.61333333,  3.80666667,  4.        ])

# enumerate all possibilities
combos = pd.DataFrame(cartesian([gres, gpas, [1, 2, 3, 4], [1.]]))

5.1 Recreate the dummy variables

# recreate the dummy variables
# keep only what we need for making predictions
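A hedged sketch of one way to finish 5.1, assuming the combos frame built above (the column names here are my own, and the original project's solution may differ):

# sketch -- the column names below are assumptions, not part of the original notebook
combos.columns = ['gre', 'gpa', 'prestige', 'intercept']
combo_dummies = pd.get_dummies(combos['prestige'])
combo_dummies.columns = ['Prestige1', 'Prestige2', 'Prestige3', 'Prestige4']   # match the training names
# keep only what the sklearn model was trained on (Prestige1 is the dropped baseline)
combos = combos[['gre', 'gpa']].join(combo_dummies[['Prestige2', 'Prestige3', 'Prestige4']])
combos.head()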

5.2 Make predictions on the enumerated dataset
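Continuing the sketch from 5.1, scoring the enumerated grid with the fitted sklearn model:

# sketch -- continues from the 5.1 sketch above
feature_cols = ['gre', 'gpa', 'Prestige2', 'Prestige3', 'Prestige4']
combos['admit_pred'] = logreg.predict(combos[feature_cols])
combos['prob_admit'] = logreg.predict_proba(combos[feature_cols])[:, 1]
combos.tail(4)   # the last four observations, for 5.3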

5.3 Interpret findings for the last 4 observations