Learn by Coding Examples in Applied Machine Learning

Regression Machine Learning Algorithms in Python with scikit-learn

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

Linear Regression

In [2]:
import pandas
from sklearn import model_selection
from sklearn.linear_model import LinearRegression

# load data
filename = 'housing.csv'

names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

dataframe = pandas.read_csv(filename, names=names, delim_whitespace=True)
print(); print(dataframe.head())

array = dataframe.values
X = array[:,0:13]
Y = array[:,13]

seed = 7

kfold = model_selection.KFold(n_splits=10, random_state=seed)

model = LinearRegression()

scoring = 'neg_mean_absolute_error'

results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

print(); print(results.mean())
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

-4.004946635323989

Ridge Regression

In [3]:
import pandas
from sklearn import model_selection
from sklearn.linear_model import Ridge

# load data
filename = 'housing.csv'

names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

dataframe = pandas.read_csv(filename, names=names, delim_whitespace=True)
print(); print(dataframe.head())

array = dataframe.values
X = array[:,0:13]
Y = array[:,13]

seed = 7

kfold = model_selection.KFold(n_splits=10, random_state=seed)

model = Ridge()

scoring = 'neg_mean_absolute_error'

results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

print(); print(results.mean())
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

-3.9188853533369974

Lasso Regression

In [4]:
import pandas
from sklearn import model_selection
from sklearn.linear_model import Lasso
# load data
filename = 'housing.csv'

names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

dataframe = pandas.read_csv(filename, names=names, delim_whitespace=True)
print(); print(dataframe.head())

array = dataframe.values
X = array[:,0:13]
Y = array[:,13]

seed = 7

kfold = model_selection.KFold(n_splits=10, random_state=seed)

model = Lasso()

scoring = 'neg_mean_absolute_error'

results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

print(); print(results.mean())
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

-4.067915451268757

ElasticNet Regression

In [5]:
import pandas
from sklearn import model_selection
from sklearn.linear_model import ElasticNet

# load data
filename = 'housing.csv'

names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

dataframe = pandas.read_csv(filename, names=names, delim_whitespace=True)
print(); print(dataframe.head())

array = dataframe.values
X = array[:,0:13]
Y = array[:,13]

seed = 7

kfold = model_selection.KFold(n_splits=10, random_state=seed)

model = ElasticNet()

scoring = 'neg_mean_absolute_error'

results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

print(); print(results.mean())
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

-3.90172818770532

KNN Regression

In [6]:
import pandas
from sklearn import model_selection
from sklearn.neighbors import KNeighborsRegressor

# load data
filename = 'housing.csv'

names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

dataframe = pandas.read_csv(filename, names=names, delim_whitespace=True)
print(); print(dataframe.head())

array = dataframe.values
X = array[:,0:13]
Y = array[:,13]

seed = 7

kfold = model_selection.KFold(n_splits=10, random_state=seed)

model = KNeighborsRegressor()

scoring = 'neg_mean_absolute_error'

results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

print(); print(results.mean())
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

-7.210794509803921

Decision Tree Regression

In [7]:
import pandas
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor

# load data
filename = 'housing.csv'

names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

dataframe = pandas.read_csv(filename, names=names, delim_whitespace=True)
print(); print(dataframe.head())

array = dataframe.values
X = array[:,0:13]
Y = array[:,13]

seed = 7

kfold = model_selection.KFold(n_splits=10, random_state=seed)

model = DecisionTreeRegressor()

scoring = 'neg_mean_absolute_error'

results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

print(); print(results.mean())
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

-3.9955725490196072

SVM Regression

In [8]:
import pandas
from sklearn import model_selection
from sklearn.svm import SVR

# load data
filename = 'housing.csv'

names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

dataframe = pandas.read_csv(filename, names=names, delim_whitespace=True)
print(); print(dataframe.head())

array = dataframe.values
X = array[:,0:13]
Y = array[:,13]

seed = 7

kfold = model_selection.KFold(n_splits=10, random_state=seed)

model = SVR()

scoring = 'neg_mean_absolute_error'

results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

print(); print(results.mean())
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

-6.828054989470113
In [ ]: