Linear regression of Boston Housing Dataset using PyData
Posted on Mon 06 November 2017 in Notebook
Import the PyData family of libraries:
In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Load Dataset¶
In [2]:
# Boston housing data as exported by the Rdatasets project (MASS::Boston).
url = 'https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/MASS/Boston.csv'
# Rdatasets CSVs store the R row names in the first column; load that column
# as the index so describe() and corr() do not treat it as a feature.
df = pd.read_csv(url, index_col=0)
In [3]:
# Preview the first rows of the loaded frame to confirm the columns parsed as expected.
df.head()
Out[3]:
Summary Statistics¶
In [4]:
# Per-column descriptive statistics (count, mean, std, min, quartiles, max).
df.describe()
Out[4]:
Correlation coefficient¶
In [5]:
# Pairwise correlation matrix across all columns (pandas defaults to Pearson).
df.corr()
Out[5]:
In [6]:
# Correlation of rm with medv, returned by NumPy as a 2x2 matrix;
# the off-diagonal entries hold the coefficient of interest.
np.corrcoef(df['rm'], df['medv'])
Out[6]:
Plotting Histogram¶
In [7]:
# Distribution of the crime-rate column, drawn with 50 bins.
plt.xlabel('Per capita crime rate by town')
plt.ylabel('Frequency')
plt.title('Histogram of Crime Rate')
# Unpack hist()'s three return values into throwaway names so the cell ends
# on an assignment and shows only the figure.
_, _, _ = plt.hist(df['crim'], bins=50)
Plotting Scattergraph¶
In [8]:
# Scatter of the zn column against the indus column; axis labels reuse the column names.
plt.ylabel('indus')
plt.xlabel('zn')
plt.scatter(df['zn'], df['indus'])
Out[8]:
In [9]:
# Scatter of rm (x-axis) against medv (y-axis), the pair modelled in the regression section.
plt.ylabel('medv')
plt.xlabel('rm')
plt.scatter(df['rm'], df['medv'])
Out[9]:
Linear Regression¶
In [10]:
from sklearn.linear_model import LinearRegression
In [11]:
# scikit-learn estimators expect 2-D arrays of shape (n_samples, n_features),
# hence the reshape of each column into a single-column matrix.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` is the
# supported replacement and yields the same ndarray.
X = df.rm.to_numpy().reshape(-1, 1)
Y = df.medv.to_numpy().reshape(-1, 1)
In [12]:
# Ordinary least-squares linear model with scikit-learn's default settings.
clf = LinearRegression()
In [13]:
# Fit medv (Y) as a linear function of rm (X); fit() returns the estimator, which the cell displays.
clf.fit(X, Y)
Out[13]:
In [14]:
# Report the slope and intercept of the fitted rm -> medv line.
print("Coefficient:{}, Intercept:{}".format(clf.coef_, clf.intercept_))
R^2¶
$$R^{2}=1-\frac{\sum_{i=1}^{n}{\left( y_{i}-f_{i} \right)^{2}}}{\sum_{i=1}^{n}{\left( y_{i}-\overline{y} \right)^{2}}}$$
In [15]:
# score() returns the coefficient of determination; it is evaluated here on
# the same data the model was fitted on.
print("R^2 :%f" % clf.score(X, Y))
In [16]:
# Grid of x values from 3 up to (but excluding) 9 in steps of 0.1 on which
# to draw the fitted line.
_x = np.arange(3, 9, 0.1)
# Evaluate slope * x + intercept by broadcasting, then flatten the result to
# 1-D so it can be passed straight to plt.plot.
_y = (clf.coef_ * _x + clf.intercept_).ravel()
Re-plotting¶
In [17]:
# Overlay the fitted regression line (_x, _y) on the rm-vs-medv scatter.
plt.xlabel('Average number of rooms per dwelling')
# Raw string: '\$' is not a valid Python escape sequence, so a plain literal
# raises an invalid-escape warning on modern interpreters. The backslash must
# still reach matplotlib so the dollar sign is drawn literally rather than
# being read as a mathtext delimiter.
plt.ylabel(r'Median value of owner-occupied homes in \$1000s')
plt.scatter(df.rm, df.medv)
plt.plot(_x, _y, color='deepskyblue')
Out[17]: