# import the necessary packages
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np
import time
import cv2
import tensorflow as tf
from tensorflow.keras.datasets import mnist
(train_X, train_y), (test_X, test_y) = mnist.load_data()
print()
print('X_train: ' + str(train_X.shape))
print('Y_train: ' + str(train_y.shape))
print('X_test: ' + str(test_X.shape))
print('Y_test: ' + str(test_y.shape))
X_train: (60000, 28, 28)
Y_train: (60000,)
X_test: (10000, 28, 28)
Y_test: (10000,)
for i in range(9):
    plt.subplot(330 + 1 + i)
    plt.imshow(train_X[i], cmap=plt.get_cmap('gray'))
plt.show()
# Reshape the training and test examples
train_x_flatten = train_X.reshape(train_X.shape[0], -1)
test_x_flatten = test_X.reshape(test_X.shape[0], -1)
train_x_flatten.shape
(60000, 784)
test_x_flatten.shape
(10000, 784)
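As a quick sanity check (a minimal sketch, not part of the original run), the flattening above is lossless: each 28x28 image becomes a 784-dimensional row vector, and reshaping a row back reproduces the original image.
# Sanity check: flattening is a lossless reshape from (28, 28) to (784,)
assert train_x_flatten.shape == (train_X.shape[0], 28 * 28)
assert test_x_flatten.shape == (test_X.shape[0], 28 * 28)
# reshaping the first flattened row back recovers the original first image
assert np.array_equal(train_x_flatten[0].reshape(28, 28), train_X[0])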
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(12,4))
for index, (image, label) in enumerate(zip(train_x_flatten[0:5], train_y[0:5])):
    plt.subplot(1, 5, index + 1)
    plt.imshow(np.reshape(image, (28,28)), cmap=plt.cm.gray)
    plt.title('Training: %i\n' % label, fontsize = 20);
# Python and Scikit-Learn Random Forest Classifier
print("SEARCHING Random Forest Classifier: ")
params = {"n_estimators": [100, 200, 300, 500]}
print(params)
start = time.time()
gs = GridSearchCV(RandomForestClassifier(), params, n_jobs = -1, verbose = 1)
gs.fit(train_x_flatten, train_y)
# print diagnostic information to the user and grab the best model
print("Jobs done in seconds : ", (time.time() - start))
print("Best score : ", (gs.best_score_))
print("Random Forest Classifier PARAMETERS")
bestParams = gs.best_estimator_.get_params()
# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print(p, bestParams[p])
SEARCHING Random Forest Classifier: 
{'n_estimators': [100, 200, 300, 500]}
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:  1.6min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  3.6min finished
Jobs done in seconds :  384.4081120491028
Best score :  0.9682999999999999
Random Forest Classifier PARAMETERS
n_estimators 500
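Since the loop above prints the best parameters "so they can be manually set", here is a brief hedged sketch of that manual step, using the value reported by the search (n_estimators = 500); the variable name best_rf is introduced only for illustration.
# Hedged sketch: re-create the best model manually from the printed parameters
best_rf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
best_rf.fit(train_x_flatten, train_y)
print("Manual RF test accuracy:", best_rf.score(test_x_flatten, test_y))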
from sklearn import metrics
predictions = gs.predict(test_x_flatten)
cm = metrics.confusion_matrix(y_true=test_y, y_pred=predictions, labels=gs.classes_)
print(cm)
score = gs.score(test_x_flatten, test_y) # test score
print(); print("Accuracy score: ", score)
[[ 969    0    0    0    0    3    3    1    3    1]
 [   0 1123    3    3    0    2    2    0    1    1]
 [   5    0 1004    5    1    0    3    8    6    0]
 [   0    0   10  976    0    6    0    9    8    1]
 [   1    0    2    0  957    0    4    1    2   15]
 [   2    0    0   11    4  861    6    2    4    2]
 [   5    3    0    0    3    3  940    0    4    0]
 [   1    2   19    0    0    0    0  995    1   10]
 [   4    0    6    7    3    5    4    4  930   11]
 [   6    5    2    8   10    3    1    4    6  964]]

Accuracy score:  0.9719
import seaborn as sns
plt.figure(figsize=(10,10))
sns.heatmap(cm, annot=True, linewidths=.5, square=True, cmap='Blues_r', fmt='d');
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title)
Text(0.5, 1.0, 'Accuracy Score: 0.9719')
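classification_report was imported at the top but is not used in the recipe; as an optional follow-up, it summarizes per-class precision, recall and F1 for the Random Forest predictions computed above (a minimal sketch, assuming the predictions variable from the previous step).
# Optional: per-class precision, recall and F1 for the Random Forest predictions
print(classification_report(test_y, predictions, digits=4))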
# Python and Scikit-Learn Logistic Regression Classifier
print("SEARCHING Logistic Regression Classifier: ")
params = {"C": [1.0, 10.0, 100.0]}
print(params)
start = time.time()
gs = GridSearchCV(LogisticRegression(), params, n_jobs = -1, verbose = 1)
gs.fit(train_x_flatten, train_y)
# print diagnostic information to the user and grab the best model
print("Jobs done in seconds : ", (time.time() - start))
print("Best score : ", (gs.best_score_))
print("Logistic Regression PARAMETERS")
bestParams = gs.best_estimator_.get_params()
# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print(p, bestParams[p])
SEARCHING Logistic Regression Classifier: 
{'C': [1.0, 10.0, 100.0]}
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   26.2s finished
Jobs done in seconds :  31.161186695098877
Best score :  0.9182166666666666
Logistic Regression PARAMETERS
C 10.0
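As with the Random Forest above, the printed value (C = 10.0) can be set manually; a compact hedged alternative is to unpack gs.best_params_ directly into a fresh estimator (best_lr is an illustrative name, not part of the original run).
# Hedged sketch: rebuild the best Logistic Regression from the search result
best_lr = LogisticRegression(**gs.best_params_)  # best_params_ is {'C': 10.0} here
best_lr.fit(train_x_flatten, train_y)
print("Manual LR test accuracy:", best_lr.score(test_x_flatten, test_y))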
from sklearn import metrics
predictions = gs.predict(test_x_flatten)
cm = metrics.confusion_matrix(y_true=test_y, y_pred=predictions, labels=gs.classes_)
print(cm)
score = gs.score(test_x_flatten, test_y) # test score
print(); print("Accuracy score: ", score)
[[ 961    0    1    3    1    3    5    4    2    0]
 [   0 1113    3    3    0    1    3    2   10    0]
 [   4   10  924   16    5    4   14    8   44    3]
 [   4    1   20  917    1   24    4    9   23    7]
 [   1    1    7    3  907    0    9    8   10   36]
 [  11    2    2   35   11  772   12    7   34    6]
 [   9    3    7    3    8   17  908    2    1    0]
 [   1    6   24    7    7    1    0  949    3   30]
 [   8    8    7   24    6   23   10   10  869    9]
 [   9    7    0   11   24    6    0   22    7  923]]

Accuracy score:  0.9243
import seaborn as sns
plt.figure(figsize=(10,10))
sns.heatmap(cm, annot=True, linewidths=.5, square=True, cmap='Blues_r', fmt='d');
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title)
Text(0.5, 1.0, 'Accuracy Score: 0.9243')
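Pipeline was imported at the top but never used, and the summary below refers to an RFC/LR pipeline; here is a minimal hedged sketch of how the same Logistic Regression search could be wrapped in a scikit-learn Pipeline. The MinMaxScaler step, the max_iter setting, and the clf__ parameter prefix are illustrative assumptions, not part of the original run.
# Hedged sketch: the same grid search expressed as a scikit-learn Pipeline
from sklearn.preprocessing import MinMaxScaler
pipe = Pipeline([("scale", MinMaxScaler()),          # rescale pixel values to [0, 1]
                 ("clf", LogisticRegression(max_iter=1000))])
pipe_params = {"clf__C": [1.0, 10.0, 100.0]}         # step name prefixes each parameter
gs_pipe = GridSearchCV(pipe, pipe_params, n_jobs=-1, verbose=1)
gs_pipe.fit(train_x_flatten, train_y)
print("Pipeline test accuracy:", gs_pipe.score(test_x_flatten, test_y))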
In this coding recipe, we discussed how to classify images with a Random Forest Classifier (RFC) and a Logistic Regression (LR) pipeline. Specifically, we have learned the following:
- how to load the MNIST dataset from tensorflow.keras and flatten the 28x28 images into 784-dimensional feature vectors;
- how to tune hyperparameters (n_estimators for the RFC, C for LR) with GridSearchCV;
- how to evaluate the best model on the test set with a confusion matrix, accuracy score, and a seaborn heatmap.