Delhi House Price Prediction Using Machine Learning
Introduction to Machine Learning
Machine Learning is the field in which computers learn from data to perform tasks without being explicitly programmed for each one.
Studying the concepts of Machine Learning isn’t enough to apply it in the real world. Besides that, you also have to learn basic statistics and probability. I learned the statistics and probability concepts used here from a YouTube channel called “Brandon Foltz”.
Libraries And Languages To Learn Machine Learning
Languages
- Python/R Programming
Libraries For python
- Pandas
- Matplotlib
- Numpy
Now let’s begin implementing House Price Prediction of Delhi using Machine Learning
Project Introduction
Here we predict the prices based on factors such as Location, Area, Type, Number of Bedrooms, etc. And the dataset is imported from Kaggle.com
https://www.kaggle.com/neelkamal692/delhi-house-price-prediction
➝Importing the Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
➝Loading the data set
df1=pd.read_csv(r'G:\Delhi_House_Price.csv')
df1.head(10)
Data Cleaning
➝Removing unnecessary Columns:
df2=df1.drop(['District','Status','Transaction','Locality'],axis='columns')
df2.head()
#The Per_Sqft column contains NaN values; we replace them with the mean of the Per_Sqft column.
per_sqft_mean=df2['Per_Sqft'].mean()
per_sqft_mean
15574.88
➝Filling the NaN values of the Per_Sqft column with the column mean
# Replace missing Per_Sqft entries with the column mean computed above.
df2['Per_Sqft'] = df2['Per_Sqft'].fillna(per_sqft_mean)
# Count the NaN values that remain in each column.
df2.isnull().sum()
Area 0
BHK 0
Bathroom 2
Furnishing 0
Location 0
Parking 32
Type 5
Per_Sqft 0
Price 0
#As only a few rows have NaN values in Parking and the other columns, we drop those rows
df3=df2.dropna()
df3.isnull().sum()
Removing Outliers
#Let us impose the constraint that a home needs at least 300 sqft per BHK; rows with Area/BHK below 300 are treated as outliers and removed
df4=df3[~(df3['Area']/df3['BHK']<300)]
df4.Per_Sqft.describe()
count 1012.000000
mean 15816.696146
std 19671.959447
min 1259.000000
25% 7000.000000
50% 14722.000000
75% 15574.885920
max 183333.000000
Name: Per_Sqft, dtype: float64
➝The function below removes outliers by keeping, for each Location, only the rows whose Per_Sqft lies within 1 standard deviation of that Location's mean.
def remove_per_sqft_outliers(df, n_std=1):
    """Drop rows whose Per_Sqft is far from the mean of their Location.

    For every group of rows sharing a 'Location' value, only rows whose
    'Per_Sqft' lies in ``[mean - n_std * std, mean + n_std * std]``
    (bounds inclusive, population standard deviation) are kept.

    Args:
        df: DataFrame with 'Location' and 'Per_Sqft' columns.
        n_std: Half-width of the accepted band, in standard deviations.
            Defaults to 1.

    Returns:
        A new DataFrame with the surviving rows, grouped by Location in
        sorted order and re-indexed from 0. The input is not modified.
        When no row survives (e.g. empty input) an empty DataFrame with the
        same columns as ``df`` is returned.
    """
    kept = []
    for _, location_df in df.groupby('Location'):
        per_sqft = location_df['Per_Sqft']
        mean = per_sqft.mean()
        # ddof=0 matches np.std, i.e. the population standard deviation.
        spread = n_std * per_sqft.std(ddof=0)
        in_band = (per_sqft >= mean - spread) & (per_sqft <= mean + spread)
        kept.append(location_df[in_band])
    if not kept:
        # pd.concat refuses an empty list; hand back an empty frame that
        # still carries the input's columns.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)
df5=remove_per_sqft_outliers(df4)
➝Plotting box plot for BHK and Per_Sqft to identify outliers
➝Removing BHK Outliers
def remove_bhk_outliers(df, min_count=5):
    """Drop homes priced per sqft below the mean of the next-smaller BHK.

    Within each 'Location', a row with ``BHK == b`` is removed when its
    'Per_Sqft' is lower than the mean 'Per_Sqft' of the rows with
    ``BHK == b - 1`` in the same location, provided that smaller-BHK group
    has more than ``min_count`` rows.

    Args:
        df: DataFrame with 'Location', 'BHK' and 'Per_Sqft' columns.
        min_count: The (BHK - 1) group must contain more than this many
            rows before it is used as a reference. Defaults to 5.

    Returns:
        A new DataFrame without the excluded rows; the original index
        labels of the remaining rows are preserved.
    """
    exclude_indices = []
    for _, location_df in df.groupby('Location'):
        bhk_groups = list(location_df.groupby('BHK'))
        # Per-BHK reference statistics for this location: (mean, row count).
        bhk_stats = {
            bhk: (bhk_df['Per_Sqft'].mean(), bhk_df.shape[0])
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            smaller = bhk_stats.get(bhk - 1)
            if smaller is None:
                continue
            smaller_mean, smaller_count = smaller
            if smaller_count > min_count:
                cheaper = bhk_df[bhk_df['Per_Sqft'] < smaller_mean]
                # A plain list keeps the labels' own dtype and avoids
                # re-copying an array on every append.
                exclude_indices.extend(cheaper.index)
    return df.drop(exclude_indices, axis='index')
df6 = remove_bhk_outliers(df5)
➝To get an insight on Per_Sqft through histogram
plt.hist(df6.Per_Sqft,rwidth=0.8,color='purple')
plt.xlabel('price per sqft')
➝To get an insight on Bathroom
plt.hist(df6.Bathroom,rwidth=0.8)
plt.xlabel('Number of bathrooms')
plt.ylabel('count')
➝Finding Unique Locations
df3['Location'].unique()
array([‘Karol Bagh’, ‘Okhla’, ‘Laxmi Nagar’, ‘Vasundhara’, ‘Dilshad Garden’, ‘Geeta Colony’, ‘Budh Vihar’, ‘Rohini’, ‘Narela’, ‘Shahdara’, ‘Alaknanda’, ‘Chhattarpur’, ‘Chittaranjan Park’, ‘Friends Colony’, ‘Malviya Nagar’, ‘Hauz Khas’, ‘Lajpat Nagar’, ‘Mahavir Enclave’, ‘Mehrauli’, ‘Greater Kailash’, ‘Sheikh Sarai’, ‘Saket’, ‘Sarita Vihar’, ‘Kalkaji’, ‘Safdarjung Enclave’, ‘Vasant Kunj’, ‘Janakpuri’, ‘Manglapuri’, ‘Sultanpur’, ‘Dwarka’, ‘Moti Nagar’, ‘Paschim Vihar’, ‘Patel Nagar’, ‘Punjabi Bagh’, ‘Uttam Nagar’], dtype=object)
➝Creating Dummies for the Locations, Furnishing and Type
# NOTE(review): df7 is never assigned in the visible text; presumably it is
# derived from df6 by a step that was lost -- confirm before running.
# One-hot encode the three categorical columns.
dummies1 = pd.get_dummies(df7.Location)
dummies2 = pd.get_dummies(df7.Furnishing)
dummies3 = pd.get_dummies(df7.Type)
df8 = pd.concat([df7, dummies1, dummies2, dummies3], axis='columns')
# Drop the original categorical columns (now encoded) together with Per_Sqft.
df9 = df8.drop(['Location', 'Type', 'Furnishing', 'Per_Sqft'], axis='columns')
df9.shape
(619, 45)
# Feature matrix: every column except the target.
x = df9.drop('Price', axis='columns')
# Target vector.
y = df9.Price
#Using GridSearchCV to select the best Machine Learning model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
def find_best_model_using_gridsearchcv(x, y):
    """Grid-search three regressors and report the best score of each.

    Linear regression, lasso and a decision tree are each tuned with
    GridSearchCV over a small parameter grid, using 5 shuffled 80/20
    splits (fixed random_state=0) for cross-validation.

    Args:
        x: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.

    Returns:
        A DataFrame with one row per algorithm and the columns
        'model', 'best_score' and 'best_params'.
    """
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                # The 'normalize' option was removed from LinearRegression
                # in scikit-learn 1.2; 'fit_intercept' exists in every
                # release.
                'fit_intercept': [True, False],
            },
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic'],
            },
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                # The MSE criterion is spelled 'mse' before scikit-learn 1.0
                # and 'squared_error' afterwards; it is the default in both,
                # so read the installed release's spelling off a fresh
                # estimator.
                'criterion': [DecisionTreeRegressor().criterion, 'friedman_mse'],
                'splitter': ['best', 'random'],
            },
        },
    }
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    scores = []
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv,
                          return_train_score=False)
        gs.fit(x, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_,
        })
    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
find_best_model_using_gridsearchcv(x,y)
➝Importing Train-Test split from sklearn
from sklearn.model_selection import train_test_split
from sklearn import tree

# Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# The criterion is left at its default, which is mean squared error in every
# scikit-learn release; the explicit spelling 'mse' is rejected from 1.2 on.
model = tree.DecisionTreeRegressor(splitter='best')
model.fit(x_train, y_train)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
#Score
model.score(x_test,y_test)
0.6882166017282265
To predict Price
def predict_price(Location, Type, Fur, Area, BHK, Bathroom, Parking):
    """Predict the price of one home with the fitted module-level ``model``.

    A single feature row is built in the column order of the module-level
    feature frame ``x``: the numeric inputs go into their same-named
    columns and the dummy columns named by ``Location``, ``Fur`` and
    ``Type`` are set to 1.

    Args:
        Location: Name of the location dummy column, e.g. 'Karol Bagh'.
        Type: Name of the type dummy column, e.g. 'Builder_Floor'.
        Fur: Name of the furnishing dummy column, e.g. 'Furnished'.
        Area: Value for the 'Area' column.
        BHK: Value for the 'BHK' column.
        Bathroom: Value for the 'Bathroom' column.
        Parking: Value for the 'Parking' column.

    Returns:
        The first (only) element of ``model.predict`` for that row.

    Raises:
        KeyError: If ``x`` lacks one of the four numeric columns.
    """
    columns = x.columns
    row = np.zeros(len(columns))
    # Look each numeric column up by name rather than by a hard-coded
    # position, so a value can never land in the wrong slot.
    numeric_inputs = (
        ('Area', Area),
        ('BHK', BHK),
        ('Bathroom', Bathroom),
        ('Parking', Parking),
    )
    for column, value in numeric_inputs:
        row[columns.get_loc(column)] = value
    # A category that has no dummy column is skipped, leaving its whole
    # dummy group at zero.
    for label in (Location, Fur, Type):
        if label in columns:
            row[columns.get_loc(label)] = 1
    # Predict on a one-row frame so the feature names seen at fit time are
    # passed along with the values.
    return model.predict(pd.DataFrame([row], columns=columns))[0]


predict_price('Karol Bagh', 'Builder_Floor', 'Furnished', 1280, 3, 3, 2)
Output:
15000000.0
Conclusion
We have thus built a Machine Learning model for house price prediction in Delhi. I hope this article is useful to you.