Machine Learning for Binary Classification

In [7]:
import warnings
warnings.filterwarnings("ignore")

# Classification Project: Sonar rocks or mines

# Load libraries
from matplotlib import pyplot
from pandas import read_csv
from pandas import set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Load dataset
url = 'sonar.all-data.csv'
dataset = read_csv(url, header=None)


# Summarize Data

# Descriptive statistics
# shape
print(dataset.shape)
# types
set_option('display.max_rows', 500)
print(dataset.dtypes)
# head
set_option('display.width', 100)
print(dataset.head(20))
# descriptions, change precision to 3 places
set_option('precision', 3)
print(dataset.describe())
# class distribution
print(dataset.groupby(60).size())


# Data visualizations

# histograms
dataset.hist(figsize = (12,12))
pyplot.show()
# density
dataset.plot(kind='density', subplots=True, layout=(8,8), sharex=False, legend=False, figsize = (12,12))
pyplot.show()

# scatter plot matrix
scatter_matrix(dataset, figsize = (12,12))
pyplot.show()

# correlation matrix
fig = pyplot.figure(figsize = (12,12))
ax = fig.add_subplot(111)
cax = ax.matshow(dataset.corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
pyplot.show()

# Prepare Data

# Split-out validation dataset
array = dataset.values
X = array[:,0:60].astype(float)
Y = array[:,60]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)


# Evaluate Algorithms

# Test options and evaluation metric
num_folds = 10
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
results = []
names = []
for name, model in models:
	kfold = KFold(n_splits=num_folds, random_state=seed)
	cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

# Compare Algorithms
fig = pyplot.figure(figsize = (12,12))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()


# Standardize the dataset
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR', LogisticRegression())])))
pipelines.append(('ScaledLDA', Pipeline([('Scaler', StandardScaler()),('LDA', LinearDiscriminantAnalysis())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsClassifier())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeClassifier())])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()),('NB', GaussianNB())])))
pipelines.append(('ScaledSVM', Pipeline([('Scaler', StandardScaler()),('SVM', SVC())])))
results = []
names = []
for name, model in pipelines:
	kfold = KFold(n_splits=num_folds, random_state=seed)
	cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

# Compare Algorithms
fig = pyplot.figure(figsize = (12,12))
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()


# Tune scaled KNN
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
neighbors = [1,3,5,7,9,11,13,15,17,19,21]
param_grid = dict(n_neighbors=neighbors)
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


# Tune scaled SVM
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = dict(C=c_values, kernel=kernel_values)
model = SVC()
kfold = KFold(n_splits=num_folds, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


# ensembles
ensembles = []
ensembles.append(('AB', AdaBoostClassifier()))
ensembles.append(('GBM', GradientBoostingClassifier()))
ensembles.append(('RF', RandomForestClassifier()))
ensembles.append(('ET', ExtraTreesClassifier()))
results = []
names = []
for name, model in ensembles:
	kfold = KFold(n_splits=num_folds, random_state=seed)
	cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

# Compare Algorithms
fig = pyplot.figure(figsize = (12,12))
fig.suptitle('Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()



# Finalize Model

# prepare the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = SVC(C=1.5)
model.fit(rescaledX, Y_train)

# estimate accuracy on validation dataset
rescaledValidationX = scaler.transform(X_validation)
predictions = model.predict(rescaledValidationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
(208, 61)
0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64
16    float64
17    float64
18    float64
19    float64
20    float64
21    float64
22    float64
23    float64
24    float64
25    float64
26    float64
27    float64
28    float64
29    float64
30    float64
31    float64
32    float64
33    float64
34    float64
35    float64
36    float64
37    float64
38    float64
39    float64
40    float64
41    float64
42    float64
43    float64
44    float64
45    float64
46    float64
47    float64
48    float64
49    float64
50    float64
51    float64
52    float64
53    float64
54    float64
55    float64
56    float64
57    float64
58    float64
59    float64
60     object
dtype: object
       0      1      2      3      4      5      6      7      8      9  ...     51     52     53  \
0   0.020  0.037  0.043  0.021  0.095  0.099  0.154  0.160  0.311  0.211 ...  0.003  0.006  0.016   
1   0.045  0.052  0.084  0.069  0.118  0.258  0.216  0.348  0.334  0.287 ...  0.008  0.009  0.005   
2   0.026  0.058  0.110  0.108  0.097  0.228  0.243  0.377  0.560  0.619 ...  0.023  0.017  0.009   
3   0.010  0.017  0.062  0.021  0.021  0.037  0.110  0.128  0.060  0.126 ...  0.012  0.004  0.015   
4   0.076  0.067  0.048  0.039  0.059  0.065  0.121  0.247  0.356  0.446 ...  0.003  0.005  0.011   
5   0.029  0.045  0.028  0.017  0.038  0.099  0.120  0.183  0.210  0.304 ...  0.004  0.001  0.004   
6   0.032  0.096  0.132  0.141  0.167  0.171  0.073  0.140  0.208  0.351 ...  0.020  0.025  0.013   
7   0.052  0.055  0.084  0.032  0.116  0.092  0.103  0.061  0.146  0.284 ...  0.008  0.012  0.004   
8   0.022  0.037  0.048  0.048  0.065  0.059  0.075  0.010  0.068  0.149 ...  0.015  0.013  0.015   
9   0.016  0.017  0.035  0.007  0.019  0.067  0.106  0.070  0.096  0.025 ...  0.009  0.022  0.018   
10  0.004  0.006  0.015  0.034  0.031  0.028  0.040  0.027  0.032  0.045 ...  0.006  0.012  0.005   
11  0.012  0.031  0.017  0.031  0.036  0.010  0.018  0.058  0.112  0.084 ...  0.013  0.026  0.022   
12  0.008  0.009  0.005  0.025  0.034  0.055  0.053  0.096  0.101  0.124 ...  0.018  0.013  0.009   
13  0.009  0.006  0.025  0.049  0.120  0.159  0.139  0.099  0.096  0.190 ...  0.006  0.009  0.019   
14  0.012  0.043  0.060  0.045  0.060  0.035  0.053  0.034  0.105  0.212 ...  0.008  0.006  0.017   
15  0.030  0.061  0.065  0.092  0.162  0.229  0.218  0.203  0.146  0.085 ...  0.003  0.015  0.007   
16  0.035  0.012  0.019  0.047  0.074  0.118  0.168  0.154  0.147  0.291 ...  0.035  0.016  0.015   
17  0.019  0.061  0.038  0.077  0.139  0.081  0.057  0.022  0.104  0.119 ...  0.033  0.013  0.012   
18  0.027  0.009  0.015  0.028  0.041  0.076  0.103  0.114  0.079  0.152 ...  0.008  0.001  0.002   
19  0.013  0.015  0.064  0.173  0.257  0.256  0.295  0.411  0.498  0.592 ...  0.009  0.004  0.010   

       54         55     56         57         58     59  60  
0   0.007  1.670e-02  0.018  8.400e-03  9.000e-03  0.003   R  
1   0.009  1.910e-02  0.014  4.900e-03  5.200e-03  0.004   R  
2   0.018  2.440e-02  0.032  1.640e-02  9.500e-03  0.008   R  
3   0.009  7.300e-03  0.005  4.400e-03  4.000e-03  0.012   R  
4   0.011  1.500e-03  0.007  4.800e-03  1.070e-02  0.009   R  
5   0.001  8.900e-03  0.006  2.700e-03  5.100e-03  0.006   R  
6   0.007  1.380e-02  0.009  1.430e-02  3.600e-03  0.010   R  
7   0.012  9.700e-03  0.009  4.700e-03  4.800e-03  0.005   R  
8   0.006  4.900e-03  0.006  9.300e-03  5.900e-03  0.002   R  
9   0.008  6.800e-03  0.003  3.500e-03  5.600e-03  0.004   R  
10  0.006  9.300e-03  0.004  3.000e-04  5.300e-03  0.004   R  
11  0.007  1.180e-02  0.003  9.200e-03  9.000e-04  0.004   R  
12  0.010  1.900e-03  0.006  5.800e-03  5.900e-03  0.003   R  
13  0.008  1.520e-02  0.016  5.300e-03  1.890e-02  0.010   R  
14  0.019  5.400e-03  0.011  1.960e-02  1.470e-02  0.006   R  
15  0.021  7.600e-03  0.015  4.900e-03  2.000e-02  0.007   R  
16  0.011  4.800e-03  0.009  1.500e-03  7.300e-03  0.007   R  
17  0.011  2.400e-03  0.004  3.700e-03  1.120e-02  0.007   R  
18  0.007  3.900e-03  0.012  1.320e-02  7.000e-03  0.009   R  
19  0.012  6.000e-04  0.018  9.400e-03  1.160e-02  0.006   R  

[20 rows x 61 columns]
            0          1        2        3        4        5        6        7        8        9   \
count  208.000  2.080e+02  208.000  208.000  208.000  208.000  208.000  208.000  208.000  208.000   
mean     0.029  3.844e-02    0.044    0.054    0.075    0.105    0.122    0.135    0.178    0.208   
std      0.023  3.296e-02    0.038    0.047    0.056    0.059    0.062    0.085    0.118    0.134   
min      0.002  6.000e-04    0.002    0.006    0.007    0.010    0.003    0.005    0.007    0.011   
25%      0.013  1.645e-02    0.019    0.024    0.038    0.067    0.081    0.080    0.097    0.111   
50%      0.023  3.080e-02    0.034    0.044    0.062    0.092    0.107    0.112    0.152    0.182   
75%      0.036  4.795e-02    0.058    0.065    0.100    0.134    0.154    0.170    0.233    0.269   
max      0.137  2.339e-01    0.306    0.426    0.401    0.382    0.373    0.459    0.683    0.711   

         ...           50         51         52       53         54         55         56  \
count    ...      208.000  2.080e+02  2.080e+02  208.000  2.080e+02  2.080e+02  2.080e+02   
mean     ...        0.016  1.342e-02  1.071e-02    0.011  9.290e-03  8.222e-03  7.820e-03   
std      ...        0.012  9.634e-03  7.060e-03    0.007  7.088e-03  5.736e-03  5.785e-03   
min      ...        0.000  8.000e-04  5.000e-04    0.001  6.000e-04  4.000e-04  3.000e-04   
25%      ...        0.008  7.275e-03  5.075e-03    0.005  4.150e-03  4.400e-03  3.700e-03   
50%      ...        0.014  1.140e-02  9.550e-03    0.009  7.500e-03  6.850e-03  5.950e-03   
75%      ...        0.021  1.673e-02  1.490e-02    0.015  1.210e-02  1.058e-02  1.043e-02   
max      ...        0.100  7.090e-02  3.900e-02    0.035  4.470e-02  3.940e-02  3.550e-02   

              57         58         59  
count  2.080e+02  2.080e+02  2.080e+02  
mean   7.949e-03  7.941e-03  6.507e-03  
std    6.470e-03  6.181e-03  5.031e-03  
min    3.000e-04  1.000e-04  6.000e-04  
25%    3.600e-03  3.675e-03  3.100e-03  
50%    5.800e-03  6.400e-03  5.300e-03  
75%    1.035e-02  1.033e-02  8.525e-03  
max    4.400e-02  3.640e-02  4.390e-02  

[8 rows x 60 columns]
60
M    111
R     97
dtype: int64
LR: 0.782721 (0.093796)
LDA: 0.746324 (0.117854)
KNN: 0.808088 (0.067507)
CART: 0.741544 (0.103740)
NB: 0.648897 (0.141868)
SVM: 0.608824 (0.118656)
ScaledLR: 0.734191 (0.095885)
ScaledLDA: 0.746324 (0.117854)
ScaledKNN: 0.825735 (0.054511)
ScaledCART: 0.758824 (0.092866)
ScaledNB: 0.648897 (0.141868)
ScaledSVM: 0.836397 (0.088697)
Best: 0.849398 using {'n_neighbors': 1}
0.849398 (0.059881) with: {'n_neighbors': 1}
0.837349 (0.066303) with: {'n_neighbors': 3}
0.837349 (0.037500) with: {'n_neighbors': 5}
0.765060 (0.089510) with: {'n_neighbors': 7}
0.753012 (0.086979) with: {'n_neighbors': 9}
0.734940 (0.104890) with: {'n_neighbors': 11}
0.734940 (0.105836) with: {'n_neighbors': 13}
0.728916 (0.075873) with: {'n_neighbors': 15}
0.710843 (0.078716) with: {'n_neighbors': 17}
0.722892 (0.084555) with: {'n_neighbors': 19}
0.710843 (0.108829) with: {'n_neighbors': 21}
Best: 0.867470 using {'C': 1.5, 'kernel': 'rbf'}
0.759036 (0.098863) with: {'C': 0.1, 'kernel': 'linear'}
0.530120 (0.118780) with: {'C': 0.1, 'kernel': 'poly'}
0.572289 (0.130339) with: {'C': 0.1, 'kernel': 'rbf'}
0.704819 (0.066360) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.746988 (0.108913) with: {'C': 0.3, 'kernel': 'linear'}
0.644578 (0.132290) with: {'C': 0.3, 'kernel': 'poly'}
0.765060 (0.092312) with: {'C': 0.3, 'kernel': 'rbf'}
0.734940 (0.054631) with: {'C': 0.3, 'kernel': 'sigmoid'}
0.740964 (0.083035) with: {'C': 0.5, 'kernel': 'linear'}
0.680723 (0.098638) with: {'C': 0.5, 'kernel': 'poly'}
0.789157 (0.064316) with: {'C': 0.5, 'kernel': 'rbf'}
0.746988 (0.059265) with: {'C': 0.5, 'kernel': 'sigmoid'}
0.746988 (0.084525) with: {'C': 0.7, 'kernel': 'linear'}
0.740964 (0.127960) with: {'C': 0.7, 'kernel': 'poly'}
0.813253 (0.084886) with: {'C': 0.7, 'kernel': 'rbf'}
0.753012 (0.058513) with: {'C': 0.7, 'kernel': 'sigmoid'}
0.759036 (0.096940) with: {'C': 0.9, 'kernel': 'linear'}
0.771084 (0.102127) with: {'C': 0.9, 'kernel': 'poly'}
0.837349 (0.087854) with: {'C': 0.9, 'kernel': 'rbf'}
0.753012 (0.073751) with: {'C': 0.9, 'kernel': 'sigmoid'}
0.753012 (0.099230) with: {'C': 1.0, 'kernel': 'linear'}
0.789157 (0.107601) with: {'C': 1.0, 'kernel': 'poly'}
0.837349 (0.087854) with: {'C': 1.0, 'kernel': 'rbf'}
0.753012 (0.070213) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.771084 (0.106063) with: {'C': 1.3, 'kernel': 'linear'}
0.819277 (0.106414) with: {'C': 1.3, 'kernel': 'poly'}
0.849398 (0.079990) with: {'C': 1.3, 'kernel': 'rbf'}
0.710843 (0.076865) with: {'C': 1.3, 'kernel': 'sigmoid'}
0.759036 (0.091777) with: {'C': 1.5, 'kernel': 'linear'}
0.831325 (0.109499) with: {'C': 1.5, 'kernel': 'poly'}
0.867470 (0.090883) with: {'C': 1.5, 'kernel': 'rbf'}
0.740964 (0.063717) with: {'C': 1.5, 'kernel': 'sigmoid'}
0.746988 (0.090228) with: {'C': 1.7, 'kernel': 'linear'}
0.831325 (0.115695) with: {'C': 1.7, 'kernel': 'poly'}
0.861446 (0.087691) with: {'C': 1.7, 'kernel': 'rbf'}
0.710843 (0.088140) with: {'C': 1.7, 'kernel': 'sigmoid'}
0.759036 (0.094276) with: {'C': 2.0, 'kernel': 'linear'}
0.831325 (0.108279) with: {'C': 2.0, 'kernel': 'poly'}
0.867470 (0.094701) with: {'C': 2.0, 'kernel': 'rbf'}
0.728916 (0.095050) with: {'C': 2.0, 'kernel': 'sigmoid'}
AB: 0.813971 (0.066017)
GBM: 0.853676 (0.100318)
RF: 0.734191 (0.084895)
ET: 0.819118 (0.094648)
0.8571428571428571
[[23  4]
 [ 2 13]]
              precision    recall  f1-score   support

           M       0.92      0.85      0.88        27
           R       0.76      0.87      0.81        15

   micro avg       0.86      0.86      0.86        42
   macro avg       0.84      0.86      0.85        42
weighted avg       0.86      0.86      0.86        42

In [ ]: