import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import datetime as dt
import seaborn as sns


pip install plotly

Requirement already satisfied: plotly in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (5.3.1)
Requirement already satisfied: tenacity>=6.2.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from plotly) (8.0.1)
Requirement already satisfied: six in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from plotly) (1.16.0)
WARNING: You are using pip version 21.0; however, version 21.2.4 is available.
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


files=['resale-flat-prices-based-on-approval-date-1990-1999.csv','resale-flat-prices-based-on-approval-date-2000-feb-2012.csv','resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv','resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv','resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv']


for x in files:
    df=pd.read_csv(x)
    if x==files[0]:
        dataset=df
        
    else:
        dataset=pd.concat([dataset,df])


df.head()


dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 826581 entries, 0 to 52202
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                826581 non-null  object 
 1   town                 826581 non-null  object 
 2   flat_type            826581 non-null  object 
 3   block                826581 non-null  object 
 4   street_name          826581 non-null  object 
 5   storey_range         826581 non-null  object 
 6   floor_area_sqm       826581 non-null  float64
 7   flat_model           826581 non-null  object 
 8   lease_commence_date  826581 non-null  int64  
 9   resale_price         826581 non-null  float64
 10  remaining_lease      117527 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 75.7+ MB


#after combining the multiple datasets, sorting by lastest data i.e 2020
dataset['month']=pd.to_datetime(dataset['month'])
dataset.sort_values('month',ascending=False,inplace=True)
dataset.head()


dataset.columns

Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price',
       'remaining_lease'],
      dtype='object')


#number of null values
dataset.isnull().sum()

month                       0
town                        0
flat_type                   0
block                       0
street_name                 0
storey_range                0
floor_area_sqm              0
flat_model                  0
lease_commence_date         0
resale_price                0
remaining_lease        709054
dtype: int64


#percent of null values in the remaining lease field
dataset.isnull().mean()

month                  0.000000
town                   0.000000
flat_type              0.000000
block                  0.000000
street_name            0.000000
storey_range           0.000000
floor_area_sqm         0.000000
flat_model             0.000000
lease_commence_date    0.000000
resale_price           0.000000
remaining_lease        0.857816
dtype: float64


#creating new column to calculate remaining lease on our own to eradicate null values.
dataset['new_remaining_lease']=99-(2021-dataset['lease_commence_date'])
dataset.head()


print(dataset['town'].unique())
print(dataset['storey_range'].unique())
print(dataset['flat_type'].unique())

['YISHUN' 'HOUGANG' 'GEYLANG' 'JURONG EAST' 'CHOA CHU KANG' 'CLEMENTI'
 'KALLANG/WHAMPOA' 'JURONG WEST' 'PASIR RIS' 'MARINE PARADE' 'BEDOK'
 'BISHAN' 'BUKIT BATOK' 'ANG MO KIO' 'BUKIT PANJANG' 'CENTRAL AREA'
 'BUKIT TIMAH' 'BUKIT MERAH' 'TAMPINES' 'TOA PAYOH' 'SENGKANG' 'SERANGOON'
 'WOODLANDS' 'PUNGGOL' 'QUEENSTOWN' 'SEMBAWANG' 'LIM CHU KANG']
['04 TO 06' '07 TO 09' '01 TO 03' '10 TO 12' '13 TO 15' '16 TO 18'
 '22 TO 24' '19 TO 21' '28 TO 30' '37 TO 39' '31 TO 33' '34 TO 36'
 '25 TO 27' '43 TO 45' '40 TO 42' '46 TO 48' '49 TO 51' '11 TO 15'
 '06 TO 10' '01 TO 05' '21 TO 25' '16 TO 20' '26 TO 30' '36 TO 40'
 '31 TO 35']
['EXECUTIVE' '4 ROOM' '3 ROOM' '5 ROOM' '2 ROOM' 'MULTI-GENERATION'
 '1 ROOM' 'MULTI GENERATION']


#cleaning flat_type field
dataset.loc[dataset['flat_type']=='MULTI GENERATION','flat_type']='MULTI-GENERATION'
print(dataset['flat_type'].unique())

['EXECUTIVE' '4 ROOM' '3 ROOM' '5 ROOM' '2 ROOM' 'MULTI-GENERATION'
 '1 ROOM']


# Renaming of flat model duplicates
replace_values = {'NEW GENERATION':'New Generation', 'SIMPLIFIED':'Simplified', 'STANDARD':'Standard', 'MODEL A-MAISONETTE':'Maisonette', 'MULTI GENERATION':'Multi Generation', 'IMPROVED-MAISONETTE':'Executive Maisonette', 'Improved-Maisonette':'Executive Maisonette', 'Premium Maisonette':'Executive Maisonette', '2-ROOM':'2-room', 'MODEL A':'Model A', 'MAISONETTE':'Maisonette', 'Model A-Maisonette':'Maisonette', 'IMPROVED':'Improved', 'TERRACE':'Terrace', 'PREMIUM APARTMENT':'Premium Apartment', 'Premium Apartment Loft':'Premium Apartment', 'APARTMENT':'Apartment', 'Type S1':'Type S1S2', 'Type S2':'Type S1S2'}

dataset = dataset.replace({'flat_model': replace_values})

dataset['flat_model'].value_counts()

Model A                 228389
Improved                217356
New Generation          177570
Simplified               53960
Standard                 39854
Premium Apartment        35066
Apartment                32004
Maisonette               28798
Model A2                  9109
DBSS                      1609
Adjoined flat             1085
Terrace                    642
Multi Generation           502
Type S1S2                  401
Executive Maisonette       196
2-room                      40
Name: flat_model, dtype: int64


ft = dataset['flat_type'].value_counts()/len(dataset)*100
ft

4 ROOM              37.420894
3 ROOM              32.976804
5 ROOM              20.616007
EXECUTIVE            7.578326
2 ROOM               1.193228
1 ROOM               0.154008
MULTI-GENERATION     0.060732
Name: flat_type, dtype: float64


#distributions of numerical features
numerical = ['resale_price', 'new_remaining_lease', 'lease_commence_date']
dataset[numerical].hist(bins=20, layout=(1, 3), figsize=(10,5))

plt.tight_layout()
plt.show()


df2 = pd.DataFrame(np.array([['4 room', 37.420894], ['3 room', 32.976804],['5 room', 20.616007],['Exec', 7.578326],['2 room', 1.193228],['1 room', 0.154008],['Multi Gen', 0.060732]]),
...                    columns=['flat type', 'proportion of ownership'])

#covert column from object to integer
df2['proportion of ownership']=pd.to_numeric(df2['proportion of ownership'])
df2.dtypes

flat type                   object
proportion of ownership    float64
dtype: object


# Plot the graph based on flat type ownership
fig = px.bar(df2, 
             x="flat type", 
             y="proportion of ownership", 
             color="flat type",
            title='HDB flat type ownership over the past 30 years (from 1990 to 2020)')

# Display the graph
fig.show()


#analysing flat type ownership for recent 5 years, has it changed?
dataset['month']=pd.to_datetime(dataset['month'])
ft2 = dataset.set_index('month')['2015':'2019'].reset_index()['flat_type'].value_counts()/len(dataset.set_index('month')['2015':'2019'].reset_index())*100
ft2

Series([], Name: flat_type, dtype: float64)


df3 = pd.DataFrame(np.array([['4 room', 41.210346], ['3 room', 25.254169],['5 room', 24.308493],['Exec', 7.897721],['2 room', 1.249396],['1 room', 0.042403],['Multi Gen', 0.037472]]),
...                    columns=['flat type', 'proportion of ownership'])

#covert column from object to integer
df3['proportion of ownership']=pd.to_numeric(df2['proportion of ownership'])
df3.dtypes

flat type                   object
proportion of ownership    float64
dtype: object


# Plot the graph based on  flat type ownership over past 5 years
fig = px.bar(df3, 
             x="flat type", 
             y="proportion of ownership", 
             color="flat type",
            title='HDB flat type ownership over the past 5 years (from 2015 to 2020)')

# Display the graph
fig.show()


dataset['year'] = pd.DatetimeIndex(dataset['month']).year # extract out year


d4 = dataset.groupby(['town','year','flat_type'], as_index=False).agg({'resale_price': 'median'}).sort_values('resale_price', ascending=True).reset_index(drop=True)
d4


d4.sort_values(['town','year','flat_type'],ascending=[True,True,True])
d4.rename(columns={'resale_price':'median_resale_price'})


d5=d4[d4['flat_type']=='4 ROOM'].sort_values('year',ascending=True)
towns_of_interest = ['TOA PAYOH','QUEENSTOWN', 'CENTRAL AREA','ANG MO KIO','BUKIT MERAH','BUKIT TIMAH','TAMPINES','BEDOK','HOUGANG','PUNGGOL','GEYLANG','KALLANG/WHAMPOA','CHUA CHU KANG','YIO CHU KANG','JURONG','WOODLANDS','YISHUN','SENGKANG']

import plotly.express as px
# Plot the graph
# isin() will check for the countries within the countries_of_interest list
fig = px.line(d5[d5['town'].isin(towns_of_interest)], 
              x="year", 
              y="resale_price", 
              color='town',
              title="Median resale prices of 4 room HDB flats over the years"
              
              )


# Display the graph
fig.show()


d6=d4[d4['flat_type']=='5 ROOM'].sort_values('year',ascending=True)
towns_of_interest = ['TOA PAYOH','QUEENSTOWN', 'CENTRAL AREA','ANG MO KIO','BUKIT MERAH','BUKIT TIMAH','TAMPINES','BEDOK','HOUGANG','PUNGGOL','GEYLANG','KALLANG/WHAMPOA','CHUA CHU KANG','YIO CHU KANG','JURONG','WOODLANDS','YISHUN','SENGKANG']

import plotly.express as px
# Plot the graph
# isin() will check for the countries within the countries_of_interest list
fig = px.line(d6[d6['town'].isin(towns_of_interest)], 
              x="year", 
              y="resale_price", 
              color='town',
              title="Median resale prices of 5 room HDB flats over the years"
              
              )


# Display the graph
fig.show()


dataset['flat_type'].unique()

array(['EXECUTIVE', '4 ROOM', '3 ROOM', '5 ROOM', '2 ROOM',
       'MULTI-GENERATION', '1 ROOM'], dtype=object)


#omitting 1 room flats as no longer relevant

dataset2=dataset[dataset['flat_type']!='1 ROOM']
dataset2[dataset2['flat_type']=='1 ROOM']


final_dataset=dataset2[['resale_price', 'town', 'flat_type', 'storey_range',
         'new_remaining_lease']]

final_dataset.head()
final_dataset.isnull().sum()

resale_price           0
town                   0
flat_type              0
storey_range           0
new_remaining_lease    0
dtype: int64


from sklearn.preprocessing import LabelEncoder
le_flat_type = LabelEncoder()
final_dataset['flat_type'] = le_flat_type.fit_transform(final_dataset['flat_type'])
final_dataset["flat_type"].unique()

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

array([4, 2, 1, 3, 0, 5])


le_storey_range = LabelEncoder()
final_dataset['storey_range'] = le_storey_range.fit_transform(final_dataset['storey_range'])
final_dataset["storey_range"].unique()

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

array([ 2,  4,  0,  5,  7,  8, 12, 10, 15, 20, 16, 18, 13, 22, 21, 23, 24,
        6,  3,  1, 11,  9, 14, 19, 17])


ol_town=final_dataset.groupby(['town'])['resale_price'].mean().sort_values()
ol_town

town
LIM CHU KANG        63660.937500
ANG MO KIO         239874.463305
YISHUN             249643.510617
GEYLANG            251508.648151
BUKIT BATOK        255545.940270
BEDOK              257304.950224
CLEMENTI           260035.284208
QUEENSTOWN         261982.385596
JURONG EAST        265365.564496
JURONG WEST        275388.321472
TOA PAYOH          279603.649199
KALLANG/WHAMPOA    289358.334017
WOODLANDS          290762.424659
HOUGANG            294900.554057
CENTRAL AREA       299062.255300
BUKIT PANJANG      307383.707821
SERANGOON          308692.979645
TAMPINES           314767.171861
MARINE PARADE      319968.169718
CHOA CHU KANG      323125.387344
BUKIT MERAH        326602.895051
SEMBAWANG          351349.202966
PASIR RIS          386514.535024
BISHAN             396656.506010
SENGKANG           401902.201948
BUKIT TIMAH        421715.937801
PUNGGOL            438753.064326
Name: resale_price, dtype: float64


from sklearn.preprocessing import LabelEncoder
le_town = LabelEncoder()
final_dataset['town'] = le_town.fit_transform(final_dataset['town'])
final_dataset["town"].unique()

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

array([26, 11, 10, 12,  8,  9, 14, 13, 17, 16,  1,  2,  3,  0,  5,  7,  6,
        4, 23, 24, 21, 22, 25, 18, 19, 20, 15])


final_dataset=final_dataset[['resale_price','town','new_remaining_lease','flat_type','storey_range']]
final_dataset.head()
final_dataset.isnull().sum()

resale_price           0
town                   0
new_remaining_lease    0
flat_type              0
storey_range           0
dtype: int64


# use scatter_matrix to see correlation of features
fig = px.scatter_matrix(final_dataset.iloc[:,1:])
fig.show()


final_dataset.head()


from sklearn.model_selection import train_test_split
# make resale price the dependent variable
y = final_dataset["resale_price"]
X = final_dataset.drop("resale_price", axis=1)


# split up the dataset into train and test sample
# with the testing size being 20%
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
    )

print(X_train)

        town  new_remaining_lease  flat_type  storey_range
229034     1                   56          3             0
56411      4                   61          2             5
164111    25                   63          1             4
16533     23                   62          1             5
115011     0                   58          1             4
...      ...                  ...        ...           ...
177867     1                   57          1             0
18479     26                   66          2             0
50943      0                   56          2             4
101153     8                   77          2             5
234805    21                   77          3             4

[660246 rows x 4 columns]


from sklearn.linear_model import LinearRegression


linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# linear_reg = LinearRegression()
# linear_reg.fit(X, y.values)

LinearRegression()


y_pred = linear_reg.predict(X_test)


from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
error = np.sqrt(mean_squared_error(y_test, y_pred))


error

106496.94034497518


from sklearn.tree import DecisionTreeRegressor
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)


y_pred = dec_tree_reg.predict(X_test)


error = np.sqrt(mean_squared_error(y_test, y_pred))


error

91435.15386262562


import statsmodels.api as sm


# make all other variables the independent variables
# add_constant to estimate a constant to the linear model
x = sm.add_constant(X)

# fit and make predictions using the model
model = sm.OLS(y, x).fit()
#predictions = model.predict(x)

# print out the summary of the regression analysis
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:           resale_price   R-squared:                       0.482
Model:                            OLS   Adj. R-squared:                  0.482
Method:                 Least Squares   F-statistic:                 1.923e+05
Date:                Sat, 04 Sep 2021   Prob (F-statistic):               0.00
Time:                        13:10:39   Log-Likelihood:            -1.0728e+07
No. Observations:              825308   AIC:                         2.146e+07
Df Residuals:                  825303   BIC:                         2.146e+07
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const               -1.372e+05    836.964   -163.955      0.000   -1.39e+05   -1.36e+05
town                -1082.3072     13.679    -79.121      0.000   -1109.118   -1055.497
new_remaining_lease  3931.2241     14.443    272.192      0.000    3902.917    3959.532
flat_type              8.2e+04    141.442    579.773      0.000    8.17e+04    8.23e+04
storey_range         7535.6585     46.372    162.505      0.000    7444.771    7626.546
==============================================================================
Omnibus:                   111664.087   Durbin-Watson:                   0.253
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           208662.489
Skew:                           0.873   Prob(JB):                         0.00
Kurtosis:                       4.738   Cond. No.                         480.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


from sklearn.ensemble import ExtraTreesRegressor
model=ExtraTreesRegressor()
model.fit(X_train, y_train)

ExtraTreesRegressor()


print(model.feature_importances_)

[0.11302622 0.26171277 0.56806065 0.05720036]


feature_importances=pd.Series(model.feature_importances_,index=X.columns)
feature_importances.plot(kind='barh')
plt.show()


from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)


X_train.shape

(660246, 4)


# Using Ensembling technique , Random Forest to predict resale prices
from sklearn.ensemble import RandomForestRegressor
rf_random=RandomForestRegressor(random_state=42)


rf_random.fit(X_train, y_train)

RandomForestRegressor(random_state=42)


error = np.sqrt(mean_squared_error(y_test, y_pred))
print("${:,.02f}".format(error))

$91,435.15


from sklearn.model_selection import GridSearchCV

max_depth = [None, 2,4,6,8,10,12]
parameters = {"max_depth": max_depth}

regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')
gs.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeRegressor(random_state=0),
             param_grid={'max_depth': [None, 2, 4, 6, 8, 10, 12]},
             scoring='neg_mean_squared_error')


regressor = gs.best_estimator_

regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
error = np.sqrt(mean_squared_error(y_test, y_pred))
print("${:,.02f}".format(error))

$91,435.15


#Hyperparameter tuning/optimization
#Randomized SearchCV

n_estimators=[int(x) for x in np.linspace(start=100,stop=1200,num=12)]
#number of features to consider at every split

max_features=['auto','sqrt']
#max number of levels in tree
max_depth= [int(x) for x in np.linspace(start=5,stop=30,num=6)]

# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]


from sklearn.model_selection import RandomizedSearchCV


# Create the random grid in a dictionary of key , value pairs
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()


# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)


rf_random.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total= 7.8min
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.8min remaining:    0.0s

[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total= 6.9min
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total= 6.7min
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10


rf_random.best_params_


predictions=rf_random.predict(X_test)


sns.distplot(y_test-predictions)


plt.scatter(y_test,predictions)


from sklearn import metrics


print('MSE:', metrics.mean_squared_error(y_test, predictions))

X


# town, new_remaining_lease, flat_type_ordinal_labels,storey_range_ordinal_labels
X = np.array([['GEYLANG', 60, '4 ROOM','07 TO 09' ]])
X

array([['GEYLANG', '60', '4 ROOM', '07 TO 09']], dtype='<U8')


X[:, 0] = le_town.transform(X[:,0])

X[:, 2] = le_flat_type.transform(X[:,2])
X[:, 3] = le_storey_range.transform(X[:,3])


X = X.astype(float)
X

array([[10., 60.,  2.,  4.]])


import pickle


data = {"model": regressor, "le_town": le_town,"le_flat_type": le_flat_type,"le_storey_range":le_storey_range}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)


with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

regressor_loaded = data["model"]
le_town = data["le_town"]
le_flat_type = data["le_flat_type"]
le_storey_range = data["le_storey_range"]


y_pred = regressor_loaded.predict(X)
y_pred

array([313397.95918367])


pip install streamlit

Requirement already satisfied: streamlit in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (0.88.0)
Requirement already satisfied: base58 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (2.1.0)
Requirement already satisfied: packaging in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (21.0)
Requirement already satisfied: pandas>=0.21.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (1.1.5)
Requirement already satisfied: gitpython!=3.1.19 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (3.1.18)
Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (2.26.0)
Requirement already satisfied: pillow>=6.2.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (8.2.0)
Requirement already satisfied: python-dateutil in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (2.8.1)
Requirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (1.19.5)
Requirement already satisfied: cachetools>=4.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (4.2.2)
Requirement already satisfied: tzlocal in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (3.0)
Requirement already satisfied: protobuf!=3.11,>=3.6.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (3.17.3)
Requirement already satisfied: toml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (0.10.2)
Requirement already satisfied: click<8.0,>=7.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (7.1.2)
Requirement already satisfied: pyarrow in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (5.0.0)
Requirement already satisfied: pydeck>=0.1.dev5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (0.6.2)
Requirement already satisfied: altair>=3.2.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (4.1.0)
Requirement already satisfied: attrs in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (21.2.0)
Requirement already satisfied: validators in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (0.18.2)
Requirement already satisfied: blinker in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (1.4)
Requirement already satisfied: tornado>=5.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (6.1)
Requirement already satisfied: astor in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from streamlit) (0.8.1)
Requirement already satisfied: entrypoints in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from altair>=3.2.0->streamlit) (0.3)
Requirement already satisfied: jinja2 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from altair>=3.2.0->streamlit) (3.0.1)
Requirement already satisfied: toolz in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from altair>=3.2.0->streamlit) (0.11.1)
Requirement already satisfied: jsonschema in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from altair>=3.2.0->streamlit) (3.2.0)
Requirement already satisfied: gitdb<5,>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from gitpython!=3.1.19->streamlit) (4.0.7)
Requirement already satisfied: typing-extensions>=3.7.4.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from gitpython!=3.1.19->streamlit) (3.10.0.0)
Requirement already satisfied: smmap<5,>=3.0.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.19->streamlit) (4.0.0)
Requirement already satisfied: pytz>=2017.2 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pandas>=0.21.0->streamlit) (2021.1)
Requirement already satisfied: six>=1.9 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from protobuf!=3.11,>=3.6.0->streamlit) (1.16.0)
Requirement already satisfied: traitlets>=4.3.2 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pydeck>=0.1.dev5->streamlit) (4.3.3)
Requirement already satisfied: ipykernel>=5.1.2 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pydeck>=0.1.dev5->streamlit) (5.5.5)
Requirement already satisfied: ipywidgets>=7.0.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pydeck>=0.1.dev5->streamlit) (7.6.4)
Requirement already satisfied: appnope in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (0.1.2)
Requirement already satisfied: ipython>=5.0.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (7.16.1)
Requirement already satisfied: jupyter-client in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (7.0.2)
Requirement already satisfied: pexpect in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (4.8.0)
Requirement already satisfied: jedi>=0.10 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (0.18.0)
Requirement already satisfied: setuptools>=18.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (39.0.1)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (3.0.20)
Requirement already satisfied: decorator in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (4.4.2)
Requirement already satisfied: pygments in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (2.10.0)
Requirement already satisfied: pickleshare in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (0.7.5)
Requirement already satisfied: backcall in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (0.2.0)
Requirement already satisfied: widgetsnbextension~=3.5.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (3.5.1)
Requirement already satisfied: ipython-genutils~=0.2.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.2.0)
Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (1.0.1)
Requirement already satisfied: nbformat>=4.2.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (5.1.3)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from jedi>=0.10->ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (0.8.2)
Requirement already satisfied: MarkupSafe>=2.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from jinja2->altair>=3.2.0->streamlit) (2.0.1)
Requirement already satisfied: jupyter-core in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (4.7.1)
Requirement already satisfied: pyrsistent>=0.14.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from jsonschema->altair>=3.2.0->streamlit) (0.18.0)
Requirement already satisfied: importlib-metadata in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from jsonschema->altair>=3.2.0->streamlit) (4.6.4)
Requirement already satisfied: wcwidth in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=5.0.0->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (0.2.5)
Requirement already satisfied: notebook>=4.4.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (6.4.3)
Requirement already satisfied: prometheus-client in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.11.0)
Requirement already satisfied: argon2-cffi in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (21.1.0)
Requirement already satisfied: terminado>=0.8.3 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.11.1)
Requirement already satisfied: pyzmq>=17 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (22.2.1)
Requirement already satisfied: nbconvert in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (6.0.7)
Requirement already satisfied: Send2Trash>=1.5.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (1.8.0)
Requirement already satisfied: nest-asyncio>=1.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from jupyter-client->ipykernel>=5.1.2->pydeck>=0.1.dev5->streamlit) (1.5.1)
Requirement already satisfied: ptyprocess in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from terminado>=0.8.3->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.7.0)
Requirement already satisfied: cffi>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (1.14.6)
Requirement already satisfied: pycparser in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from cffi>=1.0.0->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (2.20)
Requirement already satisfied: zipp>=0.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from importlib-metadata->jsonschema->altair>=3.2.0->streamlit) (3.5.0)
Requirement already satisfied: jupyterlab-pygments in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.1.2)
Requirement already satisfied: bleach in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (4.1.0)
Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.5.4)
Requirement already satisfied: mistune<2,>=0.8.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.8.4)
Requirement already satisfied: testpath in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.5.0)
Requirement already satisfied: defusedxml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.7.1)
Requirement already satisfied: pandocfilters>=1.4.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (1.4.3)
Requirement already satisfied: async-generator in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (1.10)
Requirement already satisfied: webencodings in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->pydeck>=0.1.dev5->streamlit) (0.5.1)
Requirement already satisfied: pyparsing>=2.0.2 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from packaging->streamlit) (2.4.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from requests->streamlit) (1.26.6)
Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from requests->streamlit) (2021.5.30)
Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from requests->streamlit) (3.2)
Requirement already satisfied: charset-normalizer~=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from requests->streamlit) (2.0.4)
Requirement already satisfied: backports.zoneinfo in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from tzlocal->streamlit) (0.2.1)
Requirement already satisfied: importlib-resources in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from backports.zoneinfo->tzlocal->streamlit) (5.2.2)
WARNING: You are using pip version 21.0; however, version 21.2.4 is available.
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


import streamlit

	month	town	flat_type	block	street_name	storey_range	floor_area_sqm	flat_model	lease_commence_date	resale_price	remaining_lease
80372	2020-09-01	YISHUN	EXECUTIVE	791	YISHUN AVE 2	04 TO 06	146.0	Maisonette	1987	558000.0	66 years 03 months
78711	2020-09-01	HOUGANG	4 ROOM	516	HOUGANG AVE 10	04 TO 06	104.0	Model A	1986	490000.0	64 years 11 months
78718	2020-09-01	HOUGANG	4 ROOM	334	HOUGANG AVE 5	04 TO 06	104.0	Model A	1983	368000.0	61 years 07 months
78717	2020-09-01	HOUGANG	4 ROOM	603	HOUGANG AVE 4	04 TO 06	104.0	Model A	1985	375000.0	64 years 01 month
78716	2020-09-01	HOUGANG	4 ROOM	602	HOUGANG AVE 4	07 TO 09	84.0	Simplified	1985	345000.0	64 years

Objectives¶

About the dataset:¶

In this notebook:¶

1. Loading of datasets¶

2. Data cleaning and Preprocessing¶

Types of HDB flat model in Singapore¶

3. Visual Exploratory Data Analysis¶

3.1 Proportion of flat type ownership over the past 30 years (1990 - 2020)¶

3.2 Proportion of flat type ownership over the past 5 years (2015 - 2020)¶

3.3 Median resale price of a 4 room flat over the years vs 5 room flat.¶

4. Data preparation and feature engineering/selection¶

encoding flat_type,storey_range¶

5. Model selection and training¶

5.1 Checking if Linear Regression yields good results.¶

Using Decision Tree¶

5.Random Forest and hyperparameter optimization.¶

5.2 Feature Importance¶

5.3 Using Random Forest with GridSearchCV.¶

5.4 Random forest with hyperparameter optimization¶

6. Conclusion and things to improve/work on.¶

	month	town	flat_type	block	street_name	storey_range	floor_area_sqm	flat_model	lease_commence_date	resale_price
0	2012-03	ANG MO KIO	2 ROOM	172	ANG MO KIO AVE 4	06 TO 10	45.0	Improved	1986	250000.0
1	2012-03	ANG MO KIO	2 ROOM	510	ANG MO KIO AVE 8	01 TO 05	44.0	Improved	1980	265000.0
2	2012-03	ANG MO KIO	3 ROOM	610	ANG MO KIO AVE 4	06 TO 10	68.0	New Generation	1980	315000.0
3	2012-03	ANG MO KIO	3 ROOM	474	ANG MO KIO AVE 10	01 TO 05	67.0	New Generation	1984	320000.0
4	2012-03	ANG MO KIO	3 ROOM	604	ANG MO KIO AVE 5	06 TO 10	67.0	New Generation	1980	321000.0

	town	year	flat_type	median_resale_price
0	ANG MO KIO	1991	1 ROOM	7000.0
1	BUKIT MERAH	1990	1 ROOM	7500.0
2	ANG MO KIO	1990	1 ROOM	8000.0
3	ANG MO KIO	1992	1 ROOM	8250.0
4	BUKIT MERAH	1991	1 ROOM	8950.0
...	...	...	...	...
3454	CENTRAL AREA	2016	5 ROOM	995000.0
3455	CENTRAL AREA	2019	5 ROOM	1000000.0
3456	QUEENSTOWN	2020	EXECUTIVE	1000000.0
3457	CENTRAL AREA	2020	5 ROOM	1040094.0
3458	QUEENSTOWN	2019	EXECUTIVE	1065000.0