Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
AllenDowney
GitHub Repository: AllenDowney/bayesian-analysis-recipes
Path: blob/master/incubator/frac-regression-mlb.ipynb
411 views
Kernel: bayesian
import pandas as pd import numpy as np import matplotlib.pyplot as plt import pymc3 as pm %matplotlib inline
df = pd.read_csv('datasets/mlb_2013-2016.csv') df.head()
df.plot(kind='scatter', x='R', y='Winning %')
import seaborn as sns
# df.plot(kind='scatter', x='Season', y='Team Salary') sns.violinplot(x='Season', y='Team Salary', data=df)
df.columns
# Let's predict 'Winning %' from the rest of the columns. cols = list(df.columns) cols.remove('Season') cols.remove('Team') cols.remove('Team Salary (in millions)') cols.remove('League') cols.remove('Wins') cols.remove('Losses') banned = ['Winning %', 'Wins', 'Losses'] y_cols = ['Winning %'] feat_cols = list(set(cols) - set(y_cols) - set(banned))
from sklearn.ensemble import RandomForestRegressor rfr = RandomForestRegressor(n_estimators=50) rfr.fit(df[feat_cols], df[y_cols]) plt.plot(rfr.feature_importances_)
important_cols = [] threshold = 0.03 impt_idxs = [i for i, k in enumerate(rfr.feature_importances_) if k > threshold] for i in impt_idxs: important_cols.append(feat_cols[i]) important_cols
import theano.tensor as tt with pm.Model() as model: weights = pm.Normal('weights', mu=0, sd=100**2, shape=(len(important_cols),)) perc_losses = tt.dot(weights, df[important_cols].T) # weights = pm.Normal('weights', mu=0, sd=100**2, shape=(2,)) # perc_losses = weights[0] * df['Team Salary'] + weights[1] * df['R'] alpha = 1 beta = 1 / perc_losses sd = pm.HalfCauchy('sd', beta=100) # like = pm.Beta('likelihood', alpha=alpha, beta=beta, observed=df[y_cols]) like = pm.Normal('likelihood', mu=perc_losses, sd=sd, observed=df[y_cols]) # like = pm.Beta('likelihood', mu=perc_losses, sd=sd, observed=df[y_cols]) # like = pm.Bernoulli('likelihood', p=perc_losses, observed=df[y_cols])
with model: trace = pm.sample(50000, step=pm.Metropolis())
pm.traceplot(trace)
burnin = 30000 pm.traceplot(trace[burnin:])