CoCalc -- HOUSE PRICING PREDICTION .ipynb

⁶⁵ views
ubuntu2004

Kernel: Python 3 (system-wide)

IMPORTING LIBRARIES

In [4]:

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures

READING CSV FILE

In [5]:

df = pd.read_csv("./Housing.csv")

In [6]:

df.head()

Out[6]:

In [7]:

df.shape

Out[7]:

(545, 13)

In [8]:

df.info()

Out[8]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB

In [9]:

df.describe().T

Out[9]:

In [16]:

print("Missing Values by Column")
print("-"*30)
print(df.isna().sum())
print("-"*30)
print("TOTAL MISSING VALUES:",df.isna().sum().sum())

Out[16]:

Missing Values by Column
------------------------------
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64
------------------------------
TOTAL MISSING VALUES: 0

In [35]:

columns_to_drop = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
df.drop(columns_to_drop, axis=1, inplace=True, errors='ignore')

In [36]:

df.describe()

Out[36]:

In [37]:

df.info()

Out[37]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   price      545 non-null    int64
 1   area       545 non-null    int64
 2   bedrooms   545 non-null    int64
 3   bathrooms  545 non-null    int64
 4   stories    545 non-null    int64
 5   parking    545 non-null    int64
 6   SalePrice  545 non-null    int64
dtypes: int64(7)
memory usage: 29.9 KB

Checking for null values

In [38]:

df.isnull().sum()

Out[38]:

price        0
area         0
bedrooms     0
bathrooms    0
stories      0
parking      0
SalePrice    0
dtype: int64

General corellation analysis

In [40]:

a4_dims = (10, 8)
fig, ax = plt.subplots(figsize=a4_dims)
cor = df.corr()
sns.heatmap(cor, annot = True, cmap="YlGnBu")

Out[40]:

<AxesSubplot: >

Analysis on number of bedroom feature

In [41]:

a4_dims = (15, 5)
fig, ax = plt.subplots(figsize=a4_dims)
sns.barplot(x = df.bedrooms, y = df.price)

Out[41]:

<AxesSubplot: xlabel='bedrooms', ylabel='price'>

In [42]:

df.groupby('bedrooms').price.agg([len, min, max])

Out[42]:

In [44]:

df1 = df[(df.bedrooms > 0) & (df.bedrooms < 9)].copy()
df1.shape

Out[44]:

(545, 7)

Analysis on number of bedroom feature

In [46]:

a4_dims = (5, 18)
fig, ax = plt.subplots(figsize=a4_dims)
sns.barplot(ax = ax, x = df.price, y = df.bathrooms)

Out[46]:

<AxesSubplot: xlabel='price', ylabel='bathrooms'>

In [47]:

a4_dims = (15, 8)
fig, ax = plt.subplots(figsize=a4_dims)
sns.distplot(a = df.price, bins = 1000, color = 'r', ax = ax)

Out[47]:

<AxesSubplot: xlabel='price', ylabel='Density'>

In [48]:

df.price.agg([min, max])

Out[48]:

min     1750000
max    13300000
Name: price, dtype: int64

In [55]:

df.info()

Out[55]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   price      545 non-null    int64
 1   area       545 non-null    int64
 2   bedrooms   545 non-null    int64
 3   bathrooms  545 non-null    int64
 4   stories    545 non-null    int64
 5   parking    545 non-null    int64
 6   SalePrice  545 non-null    int64
dtypes: int64(7)
memory usage: 29.9 KB

In [50]:

a4_dims = (15, 5)
fig, ax = plt.subplots(figsize=a4_dims)
sns.barplot(x = df.bathrooms, y = df.price)

Out[50]:

<AxesSubplot: xlabel='bathrooms', ylabel='price'>

Analysis on all the instances whose price is 0

In [51]:

zero_price = df[(df.price == 0)].copy()
zero_price.shape

Out[51]:

(0, 7)

In [53]:

zero_price.head()

Out[53]:

In [56]:

df.head()

Out[56]:

Splitting into train and test set

In [57]:

X = df.iloc[:, 1:]
X.shape

Out[57]:

(545, 6)

In [58]:

y = df.price

In [59]:

from sklearn.model_selection import train_test_split
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.1, random_state=42)
print(len(X_train) / len(df))

Out[59]:

0.8990825688073395

In [60]:

from sklearn.model_selection import train_test_split
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.1, random_state=42)
print(len(X_train) / len(df))

Out[60]:

0.8990825688073395

In [61]:

X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)
print(len(X_test) / len(y_rem))

Out[61]:

0.509090909090909

In [62]:

print(len(X_train))
print(len(X_val))
print(len(X_val))

Out[62]:

490
27
27

Linear regression

In [63]:

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

In [64]:

lin_reg.fit(X_train, y_train)

Out[64]:

In [65]:

from sklearn.metrics import mean_squared_error
y_pred = lin_reg.predict(X_val)
mse = mean_squared_error(y_pred, y_val)
rmse = np.sqrt(mse)
rmse

Out[65]:

2.537905438995759e-09

In [66]:

y_val.head(10)

Out[66]:

   3290000
    12250000
   2660000
   4060000
   3003000
   4900000
   2520000
   5495000
    6650000
   5425000
Name: price, dtype: int64

In [67]:

y_pred

Out[67]:

array([ 3290000.        , 12250000.00000001,  2660000.        ,
        4060000.        ,  3003000.        ,  4900000.        ,
        2520000.        ,  5495000.        ,  6650000.        ,
        5425000.        ,  3710000.        ,  8400000.        ,
        2380000.        ,  4200000.        ,  5250000.        ,
        3150000.        , 10150000.        ,  1890000.        ,
        2940000.        ,  3234000.        ,  6720000.        ,
        4543000.        ,  6650000.        ,  2275000.        ,
        9800000.        ,  2450000.        ,  3500000.        ])

In [68]:

y_pred_test = lin_reg.predict(X_test)
mse = mean_squared_error(y_pred_test, y_test)
rmse = np.sqrt(mse)
rmse

Out[68]:

1.797584403878829e-09

In [69]:

lin_reg.score(X_test, y_test)

Out[69]:

1.0

In [70]:

y_test

Out[70]:

  3500000
  4900000
   6650000
  3360000
  2870000
  1820000
  3675000
  3080000
  4907000
  3360000
  4270000
   6440000
  2940000
  3325000
  2450000
   7350000
  5250000
  2800000
  5740000
  5110000
   6580000
  4480000
  4007500
  2660000
  3353000
  2660000
  4193000
   6790000
Name: price, dtype: int64

In [71]:

y_pred_test

Out[71]:

array([3500000., 4900000., 6650000., 3360000., 2870000., 1820000.,
       3675000., 3080000., 4907000., 3360000., 4270000., 6440000.,
       2940000., 3325000., 2450000., 7350000., 5250000., 2800000.,
       5740000., 5110000., 6580000., 4480000., 4007500., 2660000.,
       3353000., 2660000., 4193000., 6790000.])

Decision tree regression

In [72]:

from sklearn.tree import DecisionTreeRegressor

reg = DecisionTreeRegressor(random_state = 42, max_depth = 10)

In [73]:

reg.fit(X_train, y_train)

Out[73]:

In [74]:

reg.score(X_test, y_test)

Out[74]:

0.9998553122209912

In [75]:

y_val.head(10)

Out[75]:

   3290000
    12250000
   2660000
   4060000
   3003000
   4900000
   2520000
   5495000
    6650000
   5425000
Name: price, dtype: int64

IMPORTING LIBRARIES

READING CSV FILE

General corellation analysis

Analysis on number of bedroom feature

Analysis on number of bedroom feature

Analysis on all the instances whose price is 0

Splitting into train and test set

Linear regression

Decision tree regression

Product

Resources

Company