Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download
Project: Data 150
Views: 74
# Numerical / data-frame stack.
import numpy as np
import pandas as pd

# Plotting stack (Axes3D is imported for its side effect of registering the
# '3d' projection on older matplotlib releases).
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

# scikit-learn pieces used by the worksheet.
from sklearn import datasets
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Load seaborn's bundled "tips" dataset and peek at the first rows.
df_tips = sns.load_dataset("tips")
X = pd.DataFrame(df_tips, columns=['tip'])
print(df_tips.head())
total_bill tip sex smoker day time size 0 16.99 1.01 Female No Sun Dinner 2 1 10.34 1.66 Male No Sun Dinner 3 2 21.01 3.50 Male No Sun Dinner 3 3 23.68 3.31 Male No Sun Dinner 2 4 24.59 3.61 Female No Sun Dinner 4
<class 'pandas.core.series.Series'>
# Tip expressed as a fraction of the total bill, one value per row.
tip_amount = df_tips['tip']
bill_amount = df_tips['total_bill']
tipperc = tip_amount.div(bill_amount)
print(tipperc)
0 0.059447 1 0.160542 2 0.166587 3 0.139780 4 0.146808 5 0.186240 6 0.228050 7 0.116071 8 0.130319 9 0.218539 10 0.166504 11 0.141804 12 0.101816 13 0.162778 14 0.203641 15 0.181650 16 0.161665 17 0.227747 18 0.206246 19 0.162228 20 0.227679 21 0.135535 22 0.141408 23 0.192288 24 0.160444 25 0.131387 26 0.149589 27 0.157604 28 0.198157 29 0.152672 ... 214 0.230742 215 0.085271 216 0.106572 217 0.129422 218 0.186047 219 0.102522 220 0.180921 221 0.259314 222 0.223776 223 0.187735 224 0.117735 225 0.153657 226 0.198216 227 0.146699 228 0.204819 229 0.130199 230 0.083299 231 0.191205 232 0.291990 233 0.136490 234 0.193175 235 0.124131 236 0.079365 237 0.035638 238 0.130338 239 0.203927 240 0.073584 241 0.088222 242 0.098204 243 0.159744 Length: 244, dtype: float64
# Wrap the tip fraction in a one-column DataFrame named 'tip_perc'.
# The previous form, pd.DataFrame([tipperc], columns='tip_perc'), failed for
# two reasons: `columns` must be list-like (a bare string is rejected), and
# wrapping the Series in a list lays it out as a single 244-wide row rather
# than a single column. Building from a dict keeps the original row index.
newX = pd.DataFrame({'tip_perc': tipperc})
print(newX)
# NOTE(review): `colmns` is not assigned in any visible cell, so this line
# raises NameError unless it is defined elsewhere -- confirm or delete.
# Also, ('') is just the empty string (not a tuple), so even on a real list
# this extend() would append nothing.
colmns.extend((''))
# Split the dataset into candidate predictors (X) and the response (y).
# The bill column is named 'total_bill' (with an underscore) in the dataset;
# asking for 'total bill' silently yields an all-NaN column.
X = pd.DataFrame(
    df_tips,
    columns=['total_bill', 'sex', 'smoker', 'day', 'time', 'size'],
)
y = pd.DataFrame(df_tips, columns=['tip'])

# head must be *called*; printing the bare attribute shows the bound method.
print(df_tips.head())
# Response: the tip amount, kept as a one-column frame.
y = pd.DataFrame(df_tips, columns=['tip'])

# Predictors: one-hot encode the categorical 'smoker' column and keep both
# resulting indicator columns.
# NOTE(review): the two indicators always sum to 1, so together with the
# intercept they are collinear -- the fitted coefficients come out equal and
# opposite. Kept as-is because later cells index both columns.
new_df_tips = pd.get_dummies(df_tips, columns=['smoker'])
feature_cols = ['smoker_Yes', 'smoker_No']
X = pd.DataFrame(new_df_tips, columns=feature_cols)
print(X)

# Hold out 25% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Ordinary least squares fit on the training split.
regModel = LinearRegression()
regModel.fit(X_train, y_train)
smoker_Yes smoker_No 0 0 1 1 0 1 2 0 1 3 0 1 4 0 1 5 0 1 6 0 1 7 0 1 8 0 1 9 0 1 10 0 1 11 0 1 12 0 1 13 0 1 14 0 1 15 0 1 16 0 1 17 0 1 18 0 1 19 0 1 20 0 1 21 0 1 22 0 1 23 0 1 24 0 1 25 0 1 26 0 1 27 0 1 28 0 1 29 0 1 .. ... ... 214 1 0 215 1 0 216 1 0 217 1 0 218 1 0 219 1 0 220 1 0 221 1 0 222 1 0 223 0 1 224 1 0 225 1 0 226 1 0 227 0 1 228 0 1 229 1 0 230 1 0 231 1 0 232 0 1 233 0 1 234 1 0 235 0 1 236 1 0 237 1 0 238 0 1 239 0 1 240 1 0 241 1 0 242 0 1 243 0 1 [244 rows x 2 columns] LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
# Intercept of the fitted regression model (read from `intercept_`).
beta_0 = regModel.intercept_
print(beta_0)
[ 2.97099262]
# Mean of the response frame `y`, printed for comparison with the intercept.
averagey = y.mean()
print(averagey)
tip 2.998279 dtype: float64
# Coefficient at position [0, 0] of `coef_`: first feature column
# ('smoker_Yes' per `feature_cols`).
beta_1_1 = regModel.coef_[0,0]
print(beta_1_1)
0.0555225330225
# Coefficient at position [0, 1] of `coef_`: second feature column
# ('smoker_No' per `feature_cols`).
beta_1_2 = regModel.coef_[0,1]
print(beta_1_2)
-0.0555225330225
# Score the fitted model on the held-out split.
y_pred = regModel.predict(X_test)
regModel_score = r2_score(y_test, y_pred)

# Format into a single string: under Python 2, print("...", value) prints a
# tuple such as ('R^2 score for the model is ', -0.0235...). str.format
# renders the same readable sentence on both Python 2 and Python 3.
print("R^2 score for the model is {}".format(regModel_score))
('R^2 score for the model is ', -0.023538757252183018)
# Sanity check: (rows, columns) of the predictor frame.
print(X.shape)
(244, 2) smoker_Yes smoker_No 0 0 1 1 0 1 2 0 1 3 0 1 4 0 1 5 0 1 6 0 1 7 0 1 8 0 1 9 0 1 10 0 1 11 0 1 12 0 1 13 0 1 14 0 1 15 0 1 16 0 1 17 0 1 18 0 1 19 0 1 20 0 1 21 0 1 22 0 1 23 0 1 24 0 1 25 0 1 26 0 1 27 0 1 28 0 1 29 0 1 .. ... ... 214 1 0 215 1 0 216 1 0 217 1 0 218 1 0 219 1 0 220 1 0 221 1 0 222 1 0 223 0 1 224 1 0 225 1 0 226 1 0 227 0 1 228 0 1 229 1 0 230 1 0 231 1 0 232 0 1 233 0 1 234 1 0 235 0 1 236 1 0 237 1 0 238 0 1 239 0 1 240 1 0 241 1 0 242 0 1 243 0 1 [244 rows x 2 columns]
# Sanity check: (rows, columns) of the response frame.
print(y.shape)
(244, 1)
# Flatten the one-column response frame into a 1-D array for plotting.
tip_matrix = np.array(y)
ya = tip_matrix.ravel()
print(ya)
[ 1.01 1.66 3.5 3.31 3.61 4.71 2. 3.12 1.96 3.23 1.71 5. 1.57 3. 3.02 3.92 1.67 3.71 3.5 3.35 4.08 2.75 2.23 7.58 3.18 2.34 2. 2. 4.3 3. 1.45 2.5 3. 2.45 3.27 3.6 2. 3.07 2.31 5. 2.24 2.54 3.06 1.32 5.6 3. 5. 6. 2.05 3. 2.5 2.6 5.2 1.56 4.34 3.51 3. 1.5 1.76 6.73 3.21 2. 1.98 3.76 2.64 3.15 2.47 1. 2.01 2.09 1.97 3. 3.14 5. 2.2 1.25 3.08 4. 3. 2.71 3. 3.4 1.83 5. 2.03 5.17 2. 4. 5.85 3. 3. 3.5 1. 4.3 3.25 4.73 4. 1.5 3. 1.5 2.5 3. 2.5 3.48 4.08 1.64 4.06 4.29 3.76 4. 3. 1. 4. 2.55 4. 3.5 5.07 1.5 1.8 2.92 2.31 1.68 2.5 2. 2.52 4.2 1.48 2. 2. 2.18 1.5 2.83 1.5 2. 3.25 1.25 2. 2. 2. 2.75 3.5 6.7 5. 5. 2.3 1.5 1.36 1.63 1.73 2. 2.5 2. 2.74 2. 2. 5.14 5. 3.75 2.61 2. 3.5 2.5 2. 2. 3. 3.48 2.24 4.5 1.61 2. 10. 3.16 5.15 3.18 4. 3.11 2. 2. 4. 3.55 3.68 5.65 3.5 6.5 3. 5. 3.5 2. 3.5 4. 1.5 4.19 2.56 2.02 4. 1.44 2. 5. 2. 2. 4. 2.01 2. 2.5 4. 3.23 3.41 3. 2.03 2.23 2. 5.16 9. 2.5 6.5 1.1 3. 1.5 1.44 3.09 2.2 3.48 1.92 3. 1.58 2.5 2. 3. 2.72 2.88 2. 3. 3.39 1.47 3. 1.25 1. 1.17 4.67 5.92 2. 2. 1.75 3. ]
# 'smoker_Yes' indicator column as a NumPy array.
xyes = np.array(X['smoker_Yes'])
print(xyes)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 1 0 0 0 1 0 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0]
# 'smoker_No' indicator column as a NumPy array.
xno = np.array(X['smoker_No'])
print(xno)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1 1 0 1 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1]
# 3-D scatter of the observed tips against the two smoker indicator columns.
fig = plt.figure(figsize=(12, 8))

# add_subplot(111, projection='3d') replaces fig.gca(projection='3d'):
# passing keyword arguments to gca() was deprecated and later removed in
# matplotlib, while add_subplot works on both old and current releases.
ax = fig.add_subplot(111, projection='3d')

# Colour each point by its tip value.
ax.scatter3D(xyes, xno, ya, c=ya, cmap='Reds', s=80)
ax.set_xlabel('smokers', fontsize=20)
ax.set_ylabel('non smokers', fontsize=20)
ax.set_zlabel('tip', fontsize=20)
plt.title('Actual Observations', fontsize=20)
plt.show()
<mpl_toolkits.mplot3d.art3d.Path3DCollection object at 0x7f3dc279f650> <matplotlib.text.Text object at 0x7f3dc28bebd0> <matplotlib.text.Text object at 0x7f3dc27ee7d0> <matplotlib.text.Text object at 0x7f3dc2805250> <matplotlib.text.Text object at 0x7f3dc2787550>
def f(x1, x2, b0, b1, b2):
    """Evaluate the two-predictor linear model ``b0 + b1*x1 + b2*x2``.

    ``x1`` and ``x2`` are the predictor values, ``b0`` is the intercept and
    ``b1`` / ``b2`` are the coefficients applied to ``x1`` / ``x2``. The
    arguments are only combined with ``+`` and ``*``, so anything supporting
    those operators (scalars, NumPy arrays) can be passed.
    """
    # Accumulate left to right, matching the order of the one-line formula.
    with_first_term = b0 + b1 * x1
    return with_first_term + b2 * x2
# Model estimate for every row: apply f to the two indicator arrays with the
# fitted intercept and coefficients.
esttips = f(xyes, xno, beta_0, beta_1_1, beta_1_2)
# Overlay the observed tips (ya) and the model's estimates (esttips),
# both plotted against row position.
fig = plt.figure(figsize=(12,8))
plt.plot(ya)
plt.plot(esttips)
plt.show()
[<matplotlib.lines.Line2D object at 0x7f3dc2755e50>] [<matplotlib.lines.Line2D object at 0x7f3dc2755f90>]
# Wireframe of the fitted model over the two smoker indicator columns.
#
# The surface grid (xx1, xx2, Z) is built here: no visible cell defines these
# names, so the plot call would otherwise fail with NameError. Both
# indicators take values in {0, 1}, so the grid spans [0, 1] on each axis.
grid_axis = np.linspace(0, 1, 30)
xx1, xx2 = np.meshgrid(grid_axis, grid_axis)
Z = f(xx1, xx2, beta_0, beta_1_1, beta_1_2)

fig = plt.figure(figsize=(12, 8))
# add_subplot(..., projection='3d') works on old and current matplotlib;
# fig.gca(projection='3d') was deprecated and later removed.
ax = fig.add_subplot(111, projection='3d')

# Labelled so that ax.legend() below has an entry to show.
ax.plot_wireframe(xx1, xx2, Z, rstride=5, cstride=6, alpha=0.2,
                  label='fitted model')

# Axis names match the features the model was actually fitted on
# (smoker_Yes / smoker_No), consistent with the scatter plot of the
# observations.
ax.set_xlabel('smokers', fontsize=20)
ax.set_ylabel('non smokers', fontsize=20)
ax.set_zlabel('tip', fontsize=20)
ax.set_zlim(0, ya.max())
plt.title('Linear Model to Estimate/Predict Tip', fontsize=20)
ax.legend()
plt.show()
︠4219cc74-6832-4a31-a931-a815c642a462︠