Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
65 views
ubuntu2004
Kernel: Python 3 (system-wide)

IMPORTING LIBRARIES

from sklearn.model_selection import train_test_split, cross_val_score from sklearn.preprocessing import StandardScaler from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error from sklearn.linear_model import LinearRegression from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso from sklearn.linear_model import ElasticNet from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR from xgboost import XGBRegressor from sklearn.preprocessing import PolynomialFeatures

READING CSV FILE

df = pd.read_csv("./Housing.csv")
df.head()
df.shape
(545, 13)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 545 entries, 0 to 544 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 545 non-null int64 1 area 545 non-null int64 2 bedrooms 545 non-null int64 3 bathrooms 545 non-null int64 4 stories 545 non-null int64 5 mainroad 545 non-null object 6 guestroom 545 non-null object 7 basement 545 non-null object 8 hotwaterheating 545 non-null object 9 airconditioning 545 non-null object 10 parking 545 non-null int64 11 prefarea 545 non-null object 12 furnishingstatus 545 non-null object dtypes: int64(6), object(7) memory usage: 55.5+ KB
df.describe().T
print("Missing Values by Column") print("-"*30) print(df.isna().sum()) print("-"*30) print("TOTAL MISSING VALUES:",df.isna().sum().sum())
Missing Values by Column ------------------------------ price 0 area 0 bedrooms 0 bathrooms 0 stories 0 mainroad 0 guestroom 0 basement 0 hotwaterheating 0 airconditioning 0 parking 0 prefarea 0 furnishingstatus 0 dtype: int64 ------------------------------ TOTAL MISSING VALUES: 0
columns_to_drop = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus'] df.drop(columns_to_drop, axis=1, inplace=True, errors='ignore')
df.describe()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 545 entries, 0 to 544 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 545 non-null int64 1 area 545 non-null int64 2 bedrooms 545 non-null int64 3 bathrooms 545 non-null int64 4 stories 545 non-null int64 5 parking 545 non-null int64 6 SalePrice 545 non-null int64 dtypes: int64(7) memory usage: 29.9 KB

Checking for null values

df.isnull().sum()
price 0 area 0 bedrooms 0 bathrooms 0 stories 0 parking 0 SalePrice 0 dtype: int64

General corellation analysis

a4_dims = (10, 8) fig, ax = plt.subplots(figsize=a4_dims) cor = df.corr() sns.heatmap(cor, annot = True, cmap="YlGnBu")
<AxesSubplot: >
Image in a Jupyter notebook

Analysis on number of bedroom feature

a4_dims = (15, 5) fig, ax = plt.subplots(figsize=a4_dims) sns.barplot(x = df.bedrooms, y = df.price)
<AxesSubplot: xlabel='bedrooms', ylabel='price'>
Image in a Jupyter notebook
df.groupby('bedrooms').price.agg([len, min, max])
df1 = df[(df.bedrooms > 0) & (df.bedrooms < 9)].copy() df1.shape
(545, 7)

Analysis on number of bedroom feature

a4_dims = (5, 18) fig, ax = plt.subplots(figsize=a4_dims) sns.barplot(ax = ax, x = df.price, y = df.bathrooms)
<AxesSubplot: xlabel='price', ylabel='bathrooms'>
Image in a Jupyter notebook
a4_dims = (15, 8) fig, ax = plt.subplots(figsize=a4_dims) sns.distplot(a = df.price, bins = 1000, color = 'r', ax = ax)
<AxesSubplot: xlabel='price', ylabel='Density'>
Image in a Jupyter notebook
df.price.agg([min, max])
min 1750000 max 13300000 Name: price, dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 545 entries, 0 to 544 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 545 non-null int64 1 area 545 non-null int64 2 bedrooms 545 non-null int64 3 bathrooms 545 non-null int64 4 stories 545 non-null int64 5 parking 545 non-null int64 6 SalePrice 545 non-null int64 dtypes: int64(7) memory usage: 29.9 KB
a4_dims = (15, 5) fig, ax = plt.subplots(figsize=a4_dims) sns.barplot(x = df.bathrooms, y = df.price)
<AxesSubplot: xlabel='bathrooms', ylabel='price'>
Image in a Jupyter notebook

Analysis on all the instances whose price is 0

zero_price = df[(df.price == 0)].copy() zero_price.shape
(0, 7)
zero_price.head()
df.head()

Splitting into train and test set

X = df.iloc[:, 1:] X.shape
(545, 6)
y = df.price
from sklearn.model_selection import train_test_split X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.1, random_state=42) print(len(X_train) / len(df))
0.8990825688073395
from sklearn.model_selection import train_test_split X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.1, random_state=42) print(len(X_train) / len(df))
0.8990825688073395
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42) print(len(X_test) / len(y_rem))
0.509090909090909
print(len(X_train)) print(len(X_val)) print(len(X_val))
490 27 27

Linear regression

from sklearn.linear_model import LinearRegression lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
from sklearn.metrics import mean_squared_error y_pred = lin_reg.predict(X_val) mse = mean_squared_error(y_pred, y_val) rmse = np.sqrt(mse) rmse
2.537905438995759e-09
y_val.head(10)
431 3290000 2 12250000 497 2660000 316 4060000 473 3003000 210 4900000 512 2520000 158 5495000 77 6650000 163 5425000 Name: price, dtype: int64
y_pred
array([ 3290000. , 12250000.00000001, 2660000. , 4060000. , 3003000. , 4900000. , 2520000. , 5495000. , 6650000. , 5425000. , 3710000. , 8400000. , 2380000. , 4200000. , 5250000. , 3150000. , 10150000. , 1890000. , 2940000. , 3234000. , 6720000. , 4543000. , 6650000. , 2275000. , 9800000. , 2450000. , 3500000. ])
y_pred_test = lin_reg.predict(X_test) mse = mean_squared_error(y_pred_test, y_test) rmse = np.sqrt(mse) rmse
1.797584403878829e-09
lin_reg.score(X_test, y_test)
1.0
y_test
398 3500000 209 4900000 79 6650000 424 3360000 486 2870000 540 1820000 367 3675000 463 3080000 199 4907000 422 3360000 284 4270000 90 6440000 483 2940000 429 3325000 516 2450000 55 7350000 176 5250000 493 2800000 137 5740000 184 5110000 83 6580000 255 4480000 324 4007500 499 2660000 426 3353000 498 2660000 304 4193000 70 6790000 Name: price, dtype: int64
y_pred_test
array([3500000., 4900000., 6650000., 3360000., 2870000., 1820000., 3675000., 3080000., 4907000., 3360000., 4270000., 6440000., 2940000., 3325000., 2450000., 7350000., 5250000., 2800000., 5740000., 5110000., 6580000., 4480000., 4007500., 2660000., 3353000., 2660000., 4193000., 6790000.])

Decision tree regression

from sklearn.tree import DecisionTreeRegressor reg = DecisionTreeRegressor(random_state = 42, max_depth = 10)
reg.fit(X_train, y_train)
reg.score(X_test, y_test)
0.9998553122209912
y_val.head(10)
431 3290000 2 12250000 497 2660000 316 4060000 473 3003000 210 4900000 512 2520000 158 5495000 77 6650000 163 5425000 Name: price, dtype: int64