# import the necessary packages
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np
import time
import cv2
import tensorflow as tf
from tensorflow.keras.datasets import mnist
(train_X, train_y), (test_X, test_y) = mnist.load_data()
print()
print('X_train: ' + str(train_X.shape))
print('Y_train: ' + str(train_y.shape))
print('X_test: ' + str(test_X.shape))
print('Y_test: ' + str(test_y.shape))
X_train: (60000, 28, 28)
Y_train: (60000,)
X_test: (10000, 28, 28)
Y_test: (10000,)
for i in range(9):
    plt.subplot(330 + 1 + i)
    plt.imshow(train_X[i], cmap=plt.get_cmap('gray'))
plt.show()
# Reshape the training and test examples
train_x_flatten = train_X.reshape(train_X.shape[0], -1)
test_x_flatten = test_X.reshape(test_X.shape[0], -1)
train_x_flatten.shape
(60000, 784)
test_x_flatten.shape
(10000, 784)
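As a quick sanity check (a minimal sketch, not part of the original run), the flattening above is lossless: each 28x28 image becomes a 784-dimensional row vector, and reshaping a row back reproduces the original image.
# Sanity check: flattening is a lossless reshape from (28, 28) to (784,)
assert train_x_flatten.shape == (train_X.shape[0], 28 * 28)
assert test_x_flatten.shape == (test_X.shape[0], 28 * 28)
# reshaping the first flattened row back recovers the original first image
assert np.array_equal(train_x_flatten[0].reshape(28, 28), train_X[0])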
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(12,4))
for index, (image, label) in enumerate(zip(train_x_flatten[0:5], train_y[0:5])):
    plt.subplot(1, 5, index + 1)
    plt.imshow(np.reshape(image, (28,28)), cmap=plt.cm.gray)
    plt.title('Training: %i\n' % label, fontsize = 20);
# Python and Scikit-Learn Random Forest Classifier
print("SEARCHING Random Forest Classifier: ")
params = {"n_estimators": [100, 200, 300, 500]}
print(params)
start = time.time()
gs = GridSearchCV(RandomForestClassifier(), params, n_jobs = -1, verbose = 1)
gs.fit(train_x_flatten, train_y)
# print diagnostic information to the user and grab the best model
print("Jobs done in seconds : ", (time.time() - start))
print("Best score : ", (gs.best_score_))
print("Random Forest Classifier PARAMETERS")
bestParams = gs.best_estimator_.get_params()
# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print(p, bestParams[p])
SEARCHING Random Forest Classifier: 
{'n_estimators': [100, 200, 300, 500]}
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:  1.6min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  3.6min finished
Jobs done in seconds :  384.4081120491028
Best score :  0.9682999999999999
Random Forest Classifier PARAMETERS
n_estimators 500
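Since the loop above prints the best parameters "so they can be manually set", here is a brief hedged sketch of that manual step, using the value reported by the search (n_estimators = 500); the variable name best_rf is introduced only for illustration.
# Hedged sketch: re-create the best model manually from the printed parameters
best_rf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
best_rf.fit(train_x_flatten, train_y)
print("Manual RF test accuracy:", best_rf.score(test_x_flatten, test_y))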
from sklearn import metrics
predictions = gs.predict(test_x_flatten)
cm = metrics.confusion_matrix(y_true=test_y, y_pred=predictions, labels=gs.classes_)
print(cm)
score = gs.score(test_x_flatten, test_y) # test score
print(); print("Accuracy score: ", score)
[[ 969    0    0    0    0    3    3    1    3    1]
 [   0 1123    3    3    0    2    2    0    1    1]
 [   5    0 1004    5    1    0    3    8    6    0]
 [   0    0   10  976    0    6    0    9    8    1]
 [   1    0    2    0  957    0    4    1    2   15]
 [   2    0    0   11    4  861    6    2    4    2]
 [   5    3    0    0    3    3  940    0    4    0]
 [   1    2   19    0    0    0    0  995    1   10]
 [   4    0    6    7    3    5    4    4  930   11]
 [   6    5    2    8   10    3    1    4    6  964]]

Accuracy score:  0.9719
import seaborn as sns
plt.figure(figsize=(10,10))
sns.heatmap(cm, annot=True, linewidths=.5, square=True, cmap='Blues_r', fmt='d');
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title)
Text(0.5, 1.0, 'Accuracy Score: 0.9719')
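classification_report was imported at the top but is not used in the recipe; as an optional follow-up, it summarizes per-class precision, recall and F1 for the Random Forest predictions computed above (a minimal sketch, assuming the predictions variable from the previous step).
# Optional: per-class precision, recall and F1 for the Random Forest predictions
print(classification_report(test_y, predictions, digits=4))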
# Python and Scikit-Learn Logistic Regression Classifier
print("SEARCHING Logistic Regression Classifier: ")
params = {"C": [1.0, 10.0, 100.0]}
print(params)
start = time.time()
gs = GridSearchCV(LogisticRegression(), params, n_jobs = -1, verbose = 1)
gs.fit(train_x_flatten, train_y)
# print diagnostic information to the user and grab the best model
print("Jobs done in seconds : ", (time.time() - start))
print("Best score : ", (gs.best_score_))
print("Logistic Regression PARAMETERS")
bestParams = gs.best_estimator_.get_params()
# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print(p, bestParams[p])
SEARCHING Logistic Regression Classifier: 
{'C': [1.0, 10.0, 100.0]}
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   26.2s finished
Jobs done in seconds :  31.161186695098877
Best score :  0.9182166666666666
Logistic Regression PARAMETERS
C 10.0
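As with the Random Forest above, the printed value (C = 10.0) can be set manually; a compact hedged alternative is to unpack gs.best_params_ directly into a fresh estimator (best_lr is an illustrative name, not part of the original run).
# Hedged sketch: rebuild the best Logistic Regression from the search result
best_lr = LogisticRegression(**gs.best_params_)  # best_params_ is {'C': 10.0} here
best_lr.fit(train_x_flatten, train_y)
print("Manual LR test accuracy:", best_lr.score(test_x_flatten, test_y))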
from sklearn import metrics
predictions = gs.predict(test_x_flatten)
cm = metrics.confusion_matrix(y_true=test_y, y_pred=predictions, labels=gs.classes_)
print(cm)
score = gs.score(test_x_flatten, test_y) # test score
print(); print("Accuracy score: ", score)
[[ 961    0    1    3    1    3    5    4    2    0]
 [   0 1113    3    3    0    1    3    2   10    0]
 [   4   10  924   16    5    4   14    8   44    3]
 [   4    1   20  917    1   24    4    9   23    7]
 [   1    1    7    3  907    0    9    8   10   36]
 [  11    2    2   35   11  772   12    7   34    6]
 [   9    3    7    3    8   17  908    2    1    0]
 [   1    6   24    7    7    1    0  949    3   30]
 [   8    8    7   24    6   23   10   10  869    9]
 [   9    7    0   11   24    6    0   22    7  923]]

Accuracy score:  0.9243
import seaborn as sns
plt.figure(figsize=(10,10))
sns.heatmap(cm, annot=True, linewidths=.5, square=True, cmap='Blues_r', fmt='d');
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title)
Text(0.5, 1.0, 'Accuracy Score: 0.9243')
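Pipeline was imported at the top but never used, and the summary below refers to an RFC/LR pipeline; here is a minimal hedged sketch of how the same Logistic Regression search could be wrapped in a scikit-learn Pipeline. The MinMaxScaler step, the max_iter setting, and the clf__ parameter prefix are illustrative assumptions, not part of the original run.
# Hedged sketch: the same grid search expressed as a scikit-learn Pipeline
from sklearn.preprocessing import MinMaxScaler
pipe = Pipeline([("scale", MinMaxScaler()),          # rescale pixel values to [0, 1]
                 ("clf", LogisticRegression(max_iter=1000))])
pipe_params = {"clf__C": [1.0, 10.0, 100.0]}         # step name prefixes each parameter
gs_pipe = GridSearchCV(pipe, pipe_params, n_jobs=-1, verbose=1)
gs_pipe.fit(train_x_flatten, train_y)
print("Pipeline test accuracy:", gs_pipe.score(test_x_flatten, test_y))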
In this coding recipe, we discussed how to classify images with a Random Forest Classifier (RFC) and a Logistic Regression (LR) pipeline. Specifically, we have learned the following:
- how to load the MNIST dataset from tensorflow.keras and flatten the 28x28 images into 784-dimensional feature vectors;
- how to tune hyperparameters (n_estimators for the RFC, C for LR) with GridSearchCV;
- how to evaluate the best model on the test set with a confusion matrix, accuracy score, and a seaborn heatmap.