Linear regression on the Boston Housing dataset using the PyData stack

Posted on Mon 06 November 2017 in Notebook

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
In [2]:
url = 'https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/MASS/Boston.csv'
df = pd.read_csv(url)
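The first column of this CSV holds the original row numbers, which pandas loads as an Unnamed: 0 column. If that column is not wanted, the file could instead be read with the first column as the index (a hedged alternative; the cells below keep the data exactly as loaded above):

df = pd.read_csv(url, index_col=0)  # use the first CSV column as the index instead of keeping 'Unnamed: 0'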
In [3]:
df.head()
Out[3]:
Unnamed: 0 crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
0 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
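Before summarizing, it is worth confirming that the frame has the expected shape and no missing values. A minimal check (not in the original notebook, but consistent with the counts shown below):

df.shape                   # (506, 15) -- 506 towns, 14 features plus the leftover index column
df.isnull().sum().sum()    # 0 -- every column has all 506 values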

Summary statistics

In [4]:
df.describe()
Out[4]:
Unnamed: 0 crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 253.500000 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 146.213884 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 1.000000 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 127.250000 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 253.500000 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 379.750000 3.677082 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 506.000000 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000

Correlation coefficients

In [5]:
df.corr()
Out[5]:
Unnamed: 0 crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
Unnamed: 0 1.000000 0.407407 -0.103393 0.399439 -0.003759 0.398736 -0.079971 0.203784 -0.302211 0.686002 0.666626 0.291074 -0.295041 0.258465 -0.226604
crim 0.407407 1.000000 -0.200469 0.406583 -0.055892 0.420972 -0.219247 0.352734 -0.379670 0.625505 0.582764 0.289946 -0.385064 0.455621 -0.388305
zn -0.103393 -0.200469 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 0.175520 -0.412995 0.360445
indus 0.399439 0.406583 -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 -0.356977 0.603800 -0.483725
chas -0.003759 -0.055892 -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 0.048788 -0.053929 0.175260
nox 0.398736 0.420972 -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.590879 -0.427321
rm -0.079971 -0.219247 0.311991 -0.391676 0.091251 -0.302188 1.000000 -0.240265 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.613808 0.695360
age 0.203784 0.352734 -0.569537 0.644779 0.086518 0.731470 -0.240265 1.000000 -0.747881 0.456022 0.506456 0.261515 -0.273534 0.602339 -0.376955
dis -0.302211 -0.379670 0.664408 -0.708027 -0.099176 -0.769230 0.205246 -0.747881 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.496996 0.249929
rad 0.686002 0.625505 -0.311948 0.595129 -0.007368 0.611441 -0.209847 0.456022 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.488676 -0.381626
tax 0.666626 0.582764 -0.314563 0.720760 -0.035587 0.668023 -0.292048 0.506456 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.543993 -0.468536
ptratio 0.291074 0.289946 -0.391679 0.383248 -0.121515 0.188933 -0.355501 0.261515 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.374044 -0.507787
black -0.295041 -0.385064 0.175520 -0.356977 0.048788 -0.380051 0.128069 -0.273534 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.366087 0.333461
lstat 0.258465 0.455621 -0.412995 0.603800 -0.053929 0.590879 -0.613808 0.602339 -0.496996 0.488676 0.543993 0.374044 -0.366087 1.000000 -0.737663
medv -0.226604 -0.388305 0.360445 -0.483725 0.175260 -0.427321 0.695360 -0.376955 0.249929 -0.381626 -0.468536 -0.507787 0.333461 -0.737663 1.000000
In [6]:
np.corrcoef(df.rm, df.medv)
Out[6]:
array([[ 1.        ,  0.69535995],
       [ 0.69535995,  1.        ]])
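Since seaborn is already imported, the full correlation matrix can also be rendered as a heatmap, which makes the strong relationships (rm vs. medv at about 0.70, rad vs. tax at about 0.91) easier to spot at a glance. A sketch, not part of the original notebook:

plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation matrix of the Boston Housing dataset')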

Plotting a histogram

In [7]:
plt.title('Histogram of Crime Rate')
plt.xlabel('Per capita crime rate by town')
plt.ylabel('Frequency')
_, _, _ = plt.hist(df.crim, bins=50)

Plotting scatter plots

In [8]:
plt.xlabel('zn')
plt.ylabel('indus')
plt.scatter(df.zn, df.indus)
Out[8]:
<matplotlib.collections.PathCollection at 0x114cfb990>
In [9]:
plt.xlabel('rm')
plt.ylabel('medv')
plt.scatter(df.rm, df.medv)
Out[9]:
<matplotlib.collections.PathCollection at 0x114ef0850>

Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
In [11]:
X, Y = df.rm.values.reshape(-1, 1), df.medv.values.reshape(-1, 1)  # .values rather than the deprecated .as_matrix()
In [12]:
clf = LinearRegression()
In [13]:
clf.fit(X, Y)
Out[13]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [14]:
print("Coefficient:{}, Intercept:{}".format(clf.coef_, clf.intercept_))
Coefficient:[[ 9.10210898]], Intercept:[-34.67062078]
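As a sanity check, the same slope and intercept can be recovered with a plain NumPy least-squares fit (a sketch used here only for verification):

slope, intercept = np.polyfit(df.rm, df.medv, deg=1)
print(slope, intercept)    # expected to match clf.coef_ (~9.102) and clf.intercept_ (~-34.671)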

R^2

$$R^{2}=1-\frac{\sum_{i=1}^{n}{\left( y_{i}-f_{i} \right)^{2}}}{\sum_{i=1}^{n}{\left( y_{i}-\overline{y} \right)^{2}}}$$

where $f_{i}$ is the value predicted by the model for observation $i$ and $\overline{y}$ is the mean of the observed values $y_{i}$.

In [15]:
print("R^2 :%f" % (clf.score(X, Y)))
R^2 :0.483525
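The same score can be computed directly from the formula above, which is what clf.score does for a regressor (a sketch for illustration):

y_pred = clf.predict(X)
ss_res = np.sum((Y - y_pred) ** 2)      # residual sum of squares
ss_tot = np.sum((Y - Y.mean()) ** 2)    # total sum of squares
print(1 - ss_res / ss_tot)              # ~0.4835, matching clf.score(X, Y)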
In [16]:
_x = np.arange(3, 9, 0.1)
_y = _x * clf.coef_ + clf.intercept_
_y = _y.reshape(-1)
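Equivalently, the fitted model can generate the line directly, which avoids the manual multiplication and reshaping of the coefficients (a minimal alternative sketch):

_y = clf.predict(_x.reshape(-1, 1)).reshape(-1)   # same values as _x * clf.coef_ + clf.intercept_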

Re-plotting with the regression line

In [17]:
plt.xlabel('Average number of rooms per dwelling')
plt.ylabel('Median value of owner-occupied homes in \$1000s')
plt.scatter(df.rm, df.medv)
plt.plot(_x, _y, color='deepskyblue')
Out[17]:
[<matplotlib.lines.Line2D at 0x11576db10>]
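Finally, the fitted model can be used to predict the median home value for a given average number of rooms. A usage sketch (the 6-room figure is only an example value):

clf.predict(np.array([[6.0]]))    # ≈ 19.9, i.e. about $19,900, given the coefficient and intercept above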