Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
YStrano
GitHub Repository: YStrano/DataScience_GA
Path: blob/master/lessons/lesson_10-sub-Jacob_Koehler/05-cali_ex.ipynb
1904 views
Kernel: Python 3

California Housing

%matplotlib notebook import matplotlib.pyplot as plt import numpy as np import pandas as pd from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split import seaborn as sns from pandas.plotting import scatter_matrix

Load the Data: Visualize It

# Load the housing table; the first CSV column is used as the row index.
cali = pd.read_csv('data/cali_housing.csv', index_col=0)
cali.head()

# Geographic scatter of every row. Longitude goes on the x-axis and latitude
# on the y-axis so the points are laid out like a conventional map (the axes
# were previously transposed). Colour encodes median_house_value and marker
# area encodes population / 100.
plt.figure(figsize=(9, 7))
plt.scatter(
    cali.longitude,
    cali.latitude,
    alpha=0.2,
    c=cali.median_house_value,
    s=cali.population / 100,
)
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.colorbar(label='median_house_value')

Looking for Relationships

# Draw a 40-bin histogram for the columns of `cali` on a 9x6 inch figure.
# The trailing semicolon keeps the notebook from echoing the returned axes.
cali.hist(figsize=(9, 6), bins=40);

Split the Data

# Target is the median house value; every other column is a candidate feature.
y = cali['median_house_value']
X = cali.drop('median_house_value', axis=1)

# Fill missing bedroom counts with the column median. The result is assigned
# back to the column: calling fillna(inplace=True) on X['total_bedrooms']
# operates on a temporary selection and leaves X unchanged when pandas
# copy-on-write is active.
# NOTE(review): the median is taken over all rows, before the train/test
# split, so the test rows influence the imputed value.
X['total_bedrooms'] = X['total_bedrooms'].fillna(X['total_bedrooms'].median())

# Ratio features derived row by row from the raw counts.
X['rooms_per_house'] = X['total_rooms'] / X['households']
X['bedrooms_per_room'] = X['total_bedrooms'] / X['total_rooms']
X['population_per_household'] = X['population'] / X['households']

# A fixed seed makes the split, and every score computed from it, repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
len(X_train)
# Re-attach the target to the training features so relationships can be
# inspected on training rows only.
train = X_train.join(y_train)

# `train` still contains the text column 'ocean_proximity'; restrict to
# numeric columns before correlating, because DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0.
corr_mat = train.select_dtypes(include='number').corr()
corr_mat['median_house_value'].sort_values(ascending=False)

# Pairwise scatter plots of the target against three selected features.
cali_cols = train[['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']]
scatter_matrix(cali_cols);
# Baseline: ordinary least squares on median_income alone.
lm = LinearRegression()
# reshape(-1, 1) turns the 1-D column into the 2-D (n_samples, 1) array
# that fit/predict are given here.
X = train['median_income'].values.reshape(-1, 1)
y = train['median_house_value']
lm.fit(X, y)
predict = lm.predict(X)

# mean_squared_error is documented as (y_true, y_pred); pass the arguments
# in that order. This is the training-set error, reported as RMSE.
mse = mean_squared_error(y, predict)
np.sqrt(mse)

Categorical Data

# Column summary before encoding.
train.info()

# One indicator column per distinct 'ocean_proximity' value, appended to
# the training frame on its index.
dummies = pd.get_dummies(train.ocean_proximity)
train = train.join(dummies)
train.columns

# Remove the '<1H OCEAN' indicator (presumably so it acts as the reference
# level for the remaining indicators -- confirm intent).
train = train.drop(columns='<1H OCEAN')
train.info()

Numerical Features

# The original text column is no longer needed once its indicator columns
# have been joined.
train = train.drop(columns='ocean_proximity')
train.info()

# Feature matrix = every remaining column except the target.
X = train.drop(columns='median_house_value')
y = train['median_house_value']

Scaling

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
# Preprocessing chain: standardise each feature, then expand to all
# degree-2 polynomial terms.
pipeline = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2)),
])

# Fit the chain on the training features and keep the transformed matrix.
housing_prepared = pipeline.fit_transform(X)

Model

# Ordinary least squares on the scaled, degree-2 expanded training features.
lm = LinearRegression()
lm.fit(housing_prepared, y)
lm.score(housing_prepared, y)
lm.predict(housing_prepared[:10])

# Training-set RMSE. mean_squared_error is documented as (y_true, y_pred),
# so the true values are passed first.
predictions = lm.predict(housing_prepared)
mse = mean_squared_error(y, predictions)
rmse = np.sqrt(mse)
rmse
from sklearn.model_selection import GridSearchCV
# 5-fold cross-validated search over the Lasso penalty strength, scored by
# negative mean squared error.
lasso = Lasso(max_iter=100000)
alphas = [0.5, 1, 4, 10, 50, 100]
param_grid = [{'alpha': alphas}]
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, y)
grid_search.best_estimator_

# GridSearchCV refits the winning configuration on the full data it was
# given (refit=True is the default), so best_estimator_ is already trained
# on housing_prepared and does not need another fit() call.
lasso = grid_search.best_estimator_

# Training-set RMSE; arguments follow the documented (y_true, y_pred) order.
predictions = lasso.predict(housing_prepared)
mse = mean_squared_error(y, predictions)
rmse = np.sqrt(mse)
rmse
# 5-fold cross-validated search over the ElasticNet penalty strength, scored
# by negative mean squared error. Only alpha is searched; every other
# ElasticNet setting is left at its default.
enet = ElasticNet(max_iter=100000)
alphas = [0.1, 0.5, 1, 4, 10, 50, 100]
param_grid = [{'alpha': alphas}]
grid_search = GridSearchCV(enet, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, y)

# GridSearchCV refits the winning configuration on the full data it was
# given (refit=True is the default), so best_estimator_ is already trained
# on housing_prepared and does not need another fit() call.
enet = grid_search.best_estimator_

# Training-set RMSE; arguments follow the documented (y_true, y_pred) order.
predictions = enet.predict(housing_prepared)
mse = mean_squared_error(y, predictions)
rmse = np.sqrt(mse)
rmse

Evaluate on Test Set

# Build the test feature frame with the same preparation the training rows
# received, without learning anything from the test rows. X_test itself is
# left untouched so this cell can be re-run.
test_features = X_test.copy()

# Impute with the TRAINING median, assigned back to the column (an in-place
# fillna on a selected column does not reliably modify the frame).
test_features['total_bedrooms'] = test_features['total_bedrooms'].fillna(
    X_train['total_bedrooms'].median()
)

# One-hot encode, then align to the exact columns (and order) the models
# were trained on. This drops 'ocean_proximity' and the '<1H OCEAN'
# indicator, and adds an all-zero column for any category that happens to
# be absent from the test rows.
test_features = test_features.join(pd.get_dummies(test_features['ocean_proximity']))
feature_cols = train.columns.drop('median_house_value')
test_features = test_features.reindex(columns=feature_cols, fill_value=0)

# Apply the already-fitted scaler + polynomial expansion. Use transform(),
# not fit_transform(): the scaling statistics must come from training data.
X = pipeline.transform(test_features)
y = y_test

# Score each trained model on the held-out rows. The models are NOT re-fit
# here; fitting on the test set and scoring on the same rows would report a
# training error rather than a generalisation error.
for name, model in (('elastic net', enet), ('lasso', lasso), ('linear regression', lm)):
    predictions = model.predict(X)
    mse = mean_squared_error(y, predictions)
    rmse = np.sqrt(mse)
    print(name, 'test RMSE:', rmse)