In [1]:
## @author: 
##     Nilimesh Halder, PhD
##     BSc in Computer Science and Engineering, 
##         @ Khulna University, Bangladesh.
##     PhD in Artificial Intelligence and Applied Machine Learning, 
##         @ The University of Western Australia, Australia.
In [2]:
# Disclaimer :
### The information and recipe presented within this Data Science Recipe is only for educational and coaching purposes for beginners and app-developers. 
### Anyone can practice and apply the recipe presented here, but the reader is taking full responsibility for his/her actions.
### The author of this recipe (code / program) has made every effort to ensure the accuracy of the information was correct at time of publication. 
### The author does not assume and hereby disclaims any liability to any party for any loss, damage, or disruption caused by errors or omissions, 
### whether such errors or omissions result from accident, negligence, or any other cause. 
### Some of the information presented here could be also found in public knowledge domains.
In [3]:
# -----------------------------------------------------------------------------
# Steps in Applied Machine Learning & Data Science :
# -----------------------------------------------------------------------------
# 1. Load Library
# 2. Load Dataset to which Machine Learning Algorithm to be applied
#    Either a) load from a CSV file or b) load from a Database   
# 3. Summarisation of Data to understand dataset (Descriptive Statistics)
# 4. Visualisation of Data to understand dataset (Plots, Graphs etc.)
# 5. Data pre-processing & Data transformation (split into train-test datasets)
# 6. Application of a Machine Learning Algorithm to training dataset 
#   a) setup a ML algorithm and parameter settings
#   b) cross validation setup with training dataset
#   c) training & fitting Algorithm with training Dataset
#   d) evaluation of trained Algorithm (or Model) and result
#   e) saving the trained model for future prediction
# 7. Load the saved model and apply it to new dataset for prediction            
# -----------------------------------------------------------------------------
In [4]:
# -------------------------------------
# 1. Load necessary libraries
# -------------------------------------
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
import os
import turicreate as tc
In [5]:
print('\nEnd-to-End Applied Machine Learning and Data Science Recipe for Beginners & Business Analysts \n\n')
print(os.getcwd())
filename = 'iris.data.csv'
print("The INPUT DataFile: ", filename)
End-to-End Applied Machine Learning and Data Science Recipe for Beginners & Business Analysts 


/Users/nilimesh/Desktop/Data Science Products/Practical-Data-Science-with-Python-and-IRIS-Dataset
The INPUT DataFile:  iris.data.csv
In [6]:
# -------------------------------------------------------------------------
# 2. Load Dataset to which Machine Learning Algorithm to be applied
# -------------------------------------------------------------------------
print(filename)
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

dataset = pd.read_csv(filename, names=col_names)

# Print dataset properties
print(dataset.shape)
print(dataset.columns)
print(dataset.head())
print(dataset.tail())
iris.data.csv
(150, 5)
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'], dtype='object')
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
     sepal_length  sepal_width  petal_length  petal_width           class
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica
In [7]:
# ---------------------------------------------------------------------------
# 3. Summarisation of Data to understand dataset (Descriptive Statistics)
# ---------------------------------------------------------------------------

cols1 = ['sepal_length','sepal_width','petal_length','petal_width']
# shape
print(dataset[cols1].shape)

# head
print(dataset[cols1].head(5))

# descriptions
print(dataset[cols1].describe())
    
# class distribution
print(dataset.groupby('class').size())
(150, 4)
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64
In [8]:
# ---------------------------------------------------------------------
# 4. Visualisation of Data to understand dataset (Plots, Graphs etc.)
# ---------------------------------------------------------------------

cols1 = ['sepal_length','sepal_width','petal_length','petal_width']
    
# box and whisker plots
print()
print("box and whisker plots")
dataset[cols1].plot(kind='box', subplots=True, layout=(1,4), sharex=False, sharey=False)
pyplot.show()

# histograms
print()
print("histograms")
dataset[cols1].hist()
pyplot.show()

# scatter plot matrix
print()
print("scatter plot matrix")
scatter_matrix(dataset[cols1])
pyplot.show()
box and whisker plots
histograms
scatter plot matrix
In [9]:
# -----------------------------------------------------------------------------------
# 5. Data pre-processing and Data transformation (split into train-test datasets)
# -----------------------------------------------------------------------------------
    
# Convert Pandas DataFrame to Turi-Create DataFrame
dataset = tc.SFrame(dataset)
    
# train-test split of dataset
train_Data, test_Data = dataset.random_split(0.67)

# print shapes of train_Data, test_Data
print('Train Data:: '); print(train_Data.shape); print(train_Data.head())
print('Test Data:: '); print(test_Data.shape); print(test_Data.head())
Train Data:: 
(107, 5)
+--------------+-------------+--------------+-------------+-------------+
| sepal_length | sepal_width | petal_length | petal_width |    class    |
+--------------+-------------+--------------+-------------+-------------+
|     5.1      |     3.5     |     1.4      |     0.2     | Iris-setosa |
|     4.9      |     3.0     |     1.4      |     0.2     | Iris-setosa |
|     4.7      |     3.2     |     1.3      |     0.2     | Iris-setosa |
|     4.6      |     3.4     |     1.4      |     0.3     | Iris-setosa |
|     5.0      |     3.4     |     1.5      |     0.2     | Iris-setosa |
|     4.9      |     3.1     |     1.5      |     0.1     | Iris-setosa |
|     5.4      |     3.7     |     1.5      |     0.2     | Iris-setosa |
|     4.8      |     3.4     |     1.6      |     0.2     | Iris-setosa |
|     4.3      |     3.0     |     1.1      |     0.1     | Iris-setosa |
|     5.8      |     4.0     |     1.2      |     0.2     | Iris-setosa |
+--------------+-------------+--------------+-------------+-------------+
[10 rows x 5 columns]

Test Data:: 
(43, 5)
+--------------+-------------+--------------+-------------+-------------+
| sepal_length | sepal_width | petal_length | petal_width |    class    |
+--------------+-------------+--------------+-------------+-------------+
|     4.6      |     3.1     |     1.5      |     0.2     | Iris-setosa |
|     5.0      |     3.6     |     1.4      |     0.2     | Iris-setosa |
|     5.4      |     3.9     |     1.7      |     0.4     | Iris-setosa |
|     4.4      |     2.9     |     1.4      |     0.2     | Iris-setosa |
|     4.8      |     3.0     |     1.4      |     0.1     | Iris-setosa |
|     5.4      |     3.9     |     1.3      |     0.4     | Iris-setosa |
|     5.7      |     3.8     |     1.7      |     0.3     | Iris-setosa |
|     5.1      |     3.8     |     1.5      |     0.3     | Iris-setosa |
|     5.1      |     3.3     |     1.7      |     0.5     | Iris-setosa |
|     4.8      |     3.1     |     1.6      |     0.2     | Iris-setosa |
+--------------+-------------+--------------+-------------+-------------+
[10 rows x 5 columns]

In [10]:
# ---------------------------------------------------------------------
# 6. Application of a Machine Learning Algorithm to training dataset 
# ---------------------------------------------------------------------

# 6.1 - logistic classifier
model_lc = tc.logistic_classifier.create(train_Data, target = 'class', features = ['sepal_length','sepal_width','petal_length','petal_width'])
print(model_lc)
    
predictions = model_lc.classify(test_Data)
print("PREDICTIONS: \n", predictions)
    
results_lc = model_lc.evaluate(test_Data)    
print("EVALUATION: \n", results_lc)

# Accuracy
print("\nAccuracy: ", results_lc['accuracy'])
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

Logistic regression:
--------------------------------------------------------
Number of examples          : 99
Number of classes           : 3
Number of feature columns   : 4
Number of unpacked features : 4
Number of coefficients      : 10
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training Accuracy | Validation Accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 0         | 1        | 1.036563     | 0.363636          | 0.250000            |
| 1         | 2        | 1.045201     | 0.848485          | 1.000000            |
| 2         | 3        | 1.045592     | 0.909091          | 1.000000            |
| 3         | 4        | 1.045953     | 0.979798          | 1.000000            |
| 4         | 5        | 1.046327     | 0.979798          | 1.000000            |
| 5         | 6        | 1.046645     | 0.979798          | 1.000000            |
| 6         | 7        | 1.047025     | 0.979798          | 1.000000            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 10
Number of examples             : 99
Number of classes              : 3
Number of feature columns      : 4
Number of unpacked features    : 4

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : newton
Solver iterations              : 6
Solver status                  : SUCCESS: Optimal solution found.
Training time (sec)            : 1.0473

Settings
--------
Log-likelihood                 : 8.4623

Highest Positive Coefficients
-----------------------------
petal_width                    : 10.3387
petal_width                    : 4.5522
petal_length                   : 3.6962
petal_length                   : 0.8587
sepal_length                   : 0.7323

Lowest Negative Coefficients
----------------------------
(intercept)                    : -21.9523
(intercept)                    : -6.2873
sepal_width                    : -1.6373
sepal_width                    : -1.1102
sepal_length                   : -0.2819

PREDICTIONS: 
 +-------------+--------------------+
|    class    |    probability     |
+-------------+--------------------+
| Iris-setosa | 0.9846596833879551 |
| Iris-setosa | 0.9891195523049199 |
| Iris-setosa | 0.9671353755986541 |
| Iris-setosa | 0.9848143402141143 |
| Iris-setosa | 0.9884038485522262 |
| Iris-setosa | 0.976463956440169  |
| Iris-setosa | 0.9708707794568615 |
| Iris-setosa | 0.9839768490074766 |
| Iris-setosa | 0.922749236641546  |
| Iris-setosa | 0.980725021636775  |
+-------------+--------------------+
[43 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
EVALUATION: 
 {'accuracy': 0.9534883720930233, 'auc': 0.9984248341246308, 'confusion_matrix': Columns:
	target_label	str
	predicted_label	str
	count	int

Rows: 5

Data:
+-----------------+-----------------+-------+
|   target_label  | predicted_label | count |
+-----------------+-----------------+-------+
|   Iris-setosa   |   Iris-setosa   |   12  |
|  Iris-virginica |  Iris-virginica |   16  |
|  Iris-virginica | Iris-versicolor |   1   |
| Iris-versicolor | Iris-versicolor |   13  |
| Iris-versicolor |  Iris-virginica |   1   |
+-----------------+-----------------+-------+
[5 rows x 3 columns]
, 'f1_score': 0.9565826330532213, 'log_loss': 0.11034311705728972, 'precision': 0.9565826330532213, 'recall': 0.9565826330532213, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int
	class	int

Rows: 300003

Data:
+-----------+--------------------+-----+----+----+-------+
| threshold |        fpr         | tpr | p  | n  | class |
+-----------+--------------------+-----+----+----+-------+
|    0.0    |        1.0         | 1.0 | 12 | 31 |   0   |
|   1e-05   | 0.6774193548387096 | 1.0 | 12 | 31 |   0   |
|   2e-05   | 0.6129032258064516 | 1.0 | 12 | 31 |   0   |
|   3e-05   | 0.5806451612903226 | 1.0 | 12 | 31 |   0   |
|   4e-05   | 0.5806451612903226 | 1.0 | 12 | 31 |   0   |
|   5e-05   | 0.5806451612903226 | 1.0 | 12 | 31 |   0   |
|   6e-05   | 0.5806451612903226 | 1.0 | 12 | 31 |   0   |
|   7e-05   | 0.5806451612903226 | 1.0 | 12 | 31 |   0   |
|   8e-05   | 0.5806451612903226 | 1.0 | 12 | 31 |   0   |
|   9e-05   | 0.5483870967741935 | 1.0 | 12 | 31 |   0   |
+-----------+--------------------+-----+----+----+-------+
[300003 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

Accuracy:  0.9534883720930233
In [11]:
# 6.2 - boosted tree classifier
model_bt = tc.boosted_trees_classifier.create(train_Data, target = 'class', features = ['sepal_length','sepal_width','petal_length','petal_width'])
print(model_bt)
    
predictions = model_bt.classify(test_Data)
print("PREDICTIONS: \n", predictions)
    
results_bt = model_bt.evaluate(test_Data)    
print("EVALUATION: \n", results_bt)

# Accuracy
print("\nAccuracy: ", results_bt['accuracy'])
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

Boosted trees classifier:
--------------------------------------------------------
Number of examples          : 100
Number of classes           : 3
Number of feature columns   : 4
Number of unpacked features : 4
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| Iteration | Elapsed Time | Training Accuracy | Validation Accuracy | Training Log Loss | Validation Log Loss |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| 1         | 0.006611     | 1.000000          | 0.857143            | 0.735352          | 0.820485            |
| 2         | 0.012153     | 1.000000          | 0.857143            | 0.520420          | 0.670111            |
| 3         | 0.014755     | 1.000000          | 0.857143            | 0.378747          | 0.584241            |
| 4         | 0.016736     | 1.000000          | 0.857143            | 0.280970          | 0.535130            |
| 5         | 0.018926     | 1.000000          | 0.857143            | 0.211104          | 0.510858            |
| 10        | 0.029922     | 1.000000          | 0.857143            | 0.060749          | 0.552356            |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
Class                          : BoostedTreesClassifier

Schema
------
Number of examples             : 100
Number of feature columns      : 4
Number of unpacked features    : 4
Number of classes              : 3

Settings
--------
Number of trees                : 30
Max tree depth                 : 6
Training time (sec)            : 0.0304
Training accuracy              : 1.0
Validation accuracy            : 0.8571
Training log_loss              : 0.0607
Validation log_loss            : 0.5524
Training auc                   : 1.0
Validation auc                 : 0.8667

PREDICTIONS: 
 +-------------+--------------------+
|    class    |    probability     |
+-------------+--------------------+
| Iris-setosa | 0.9491549730300903 |
| Iris-setosa | 0.9491549730300903 |
| Iris-setosa | 0.9491549730300903 |
| Iris-setosa | 0.9373205304145813 |
| Iris-setosa | 0.9491549730300903 |
| Iris-setosa | 0.9491549730300903 |
| Iris-setosa | 0.9491549730300903 |
| Iris-setosa | 0.9491549730300903 |
| Iris-setosa | 0.9491549730300903 |
| Iris-setosa | 0.9491549730300903 |
+-------------+--------------------+
[43 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
EVALUATION: 
 {'accuracy': 0.9767441860465116, 'auc': 0.9952745023738938, 'confusion_matrix': Columns:
	target_label	str
	predicted_label	str
	count	int

Rows: 4

Data:
+-----------------+-----------------+-------+
|   target_label  | predicted_label | count |
+-----------------+-----------------+-------+
|   Iris-setosa   |   Iris-setosa   |   12  |
| Iris-versicolor | Iris-versicolor |   13  |
| Iris-versicolor |  Iris-virginica |   1   |
|  Iris-virginica |  Iris-virginica |   17  |
+-----------------+-----------------+-------+
[4 rows x 3 columns]
, 'f1_score': 0.9781305114638448, 'log_loss': 0.14443349210873208, 'precision': 0.9814814814814815, 'recall': 0.9761904761904763, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int
	class	int

Rows: 300003

Data:
+-----------+-----+-----+----+----+-------+
| threshold | fpr | tpr | p  | n  | class |
+-----------+-----+-----+----+----+-------+
|    0.0    | 1.0 | 1.0 | 12 | 31 |   0   |
|   1e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   2e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   3e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   4e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   5e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   6e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   7e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   8e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   9e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
+-----------+-----+-----+----+----+-------+
[300003 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

Accuracy:  0.9767441860465116
In [12]:
# 6.3 - decision tree classifier
model_dt = tc.decision_tree_classifier.create(train_Data, target = 'class', features = ['sepal_length','sepal_width','petal_length','petal_width'])
print(model_dt)
    
predictions = model_dt.classify(test_Data)
print("PREDICTIONS: \n", predictions)
    
results_dt = model_dt.evaluate(test_Data)    
print("EVALUATION: \n", results_dt)

# Accuracy
print("\nAccuracy: ", results_dt['accuracy'])
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

Decision tree classifier:
--------------------------------------------------------
Number of examples          : 96
Number of classes           : 3
Number of feature columns   : 4
Number of unpacked features : 4
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| Iteration | Elapsed Time | Training Accuracy | Validation Accuracy | Training Log Loss | Validation Log Loss |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| 1         | 0.008556     | 1.000000          | 0.818182            | 0.232097          | 0.601487            |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
Class                          : DecisionTreeClassifier

Schema
------
Number of examples             : 96
Number of feature columns      : 4
Number of unpacked features    : 4
Number of classes              : 3

Settings
--------
Number of trees                : 3
Max tree depth                 : 6
Training time (sec)            : 0.0089
Training accuracy              : 1.0
Validation accuracy            : 0.8182
Training log_loss              : 0.2321
Validation log_loss            : 0.6015
Training auc                   : 1.0
Validation auc                 : 0.9048

PREDICTIONS: 
 +-------------+--------------------+
|    class    |    probability     |
+-------------+--------------------+
| Iris-setosa | 0.8070570826530457 |
| Iris-setosa | 0.8070570826530457 |
| Iris-setosa | 0.8070570826530457 |
| Iris-setosa | 0.8070570826530457 |
| Iris-setosa | 0.8070570826530457 |
| Iris-setosa | 0.8070570826530457 |
| Iris-setosa | 0.8070570826530457 |
| Iris-setosa | 0.8070570826530457 |
| Iris-setosa | 0.8070570826530457 |
| Iris-setosa | 0.8070570826530457 |
+-------------+--------------------+
[43 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
EVALUATION: 
 {'accuracy': 0.9534883720930233, 'auc': 0.9779476777448379, 'confusion_matrix': Columns:
	target_label	str
	predicted_label	str
	count	int

Rows: 4

Data:
+-----------------+-----------------+-------+
|   target_label  | predicted_label | count |
+-----------------+-----------------+-------+
|   Iris-setosa   |   Iris-setosa   |   12  |
|  Iris-virginica |  Iris-virginica |   15  |
|  Iris-virginica | Iris-versicolor |   2   |
| Iris-versicolor | Iris-versicolor |   14  |
+-----------------+-----------------+-------+
[4 rows x 3 columns]
, 'f1_score': 0.9569444444444445, 'log_loss': 0.3241047277614516, 'precision': 0.9583333333333334, 'recall': 0.9607843137254902, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int
	class	int

Rows: 300003

Data:
+-----------+-----+-----+----+----+-------+
| threshold | fpr | tpr | p  | n  | class |
+-----------+-----+-----+----+----+-------+
|    0.0    | 1.0 | 1.0 | 12 | 31 |   0   |
|   1e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   2e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   3e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   4e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   5e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   6e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   7e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   8e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   9e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
+-----------+-----+-----+----+----+-------+
[300003 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

Accuracy:  0.9534883720930233
In [13]:
# 6.4 - nearest neighbour classifier
model_nn = tc.nearest_neighbor_classifier.create(train_Data, target = 'class', features = ['sepal_length','sepal_width','petal_length','petal_width'])
print(model_nn)
    
predictions = model_nn.classify(test_Data)
print("PREDICTIONS: \n", predictions)
    
results_nn = model_nn.evaluate(test_Data)    
print("EVALUATION: \n", results_nn)

# Accuracy
print("\nAccuracy: ", results_nn['accuracy'])
Starting ball tree nearest neighbors model training.
+------------+--------------+
| Tree level | Elapsed Time |
+------------+--------------+
| 0          | 120us        |
+------------+--------------+
Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 107
Number of feature columns            : 4
Number of unpacked features          : 4
Number of distance components        : 1
Number of classes                    : 3

Training Summary
----------------
Training time (seconds)              : 19.2059

+--------------+-------------+--------------+
| Query points | % Complete. | Elapsed Time |
+--------------+-------------+--------------+
| 1            | 2.25        | 171us        |
| Done         |             | 404us        |
+--------------+-------------+--------------+
PREDICTIONS: 
 +-------------+-------------+
|    class    | probability |
+-------------+-------------+
| Iris-setosa |     1.0     |
| Iris-setosa |     1.0     |
| Iris-setosa |     1.0     |
| Iris-setosa |     1.0     |
| Iris-setosa |     1.0     |
| Iris-setosa |     1.0     |
| Iris-setosa |     1.0     |
| Iris-setosa |     1.0     |
| Iris-setosa |     1.0     |
| Iris-setosa |     1.0     |
+-------------+-------------+
[43 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
WARNING: Ignoring `roc_curve`. Not supported for multi-class classification.
+--------------+-------------+--------------+
| Query points | % Complete. | Elapsed Time |
+--------------+-------------+--------------+
| 1            | 2.25        | 149us        |
| Done         |             | 385us        |
+--------------+-------------+--------------+
+--------------+-------------+--------------+
| Query points | % Complete. | Elapsed Time |
+--------------+-------------+--------------+
| 1            | 2.25        | 153us        |
| Done         |             | 395us        |
+--------------+-------------+--------------+
EVALUATION: 
 {'accuracy': 0.9534883720930233, 'confusion_matrix': Columns:
	target_label	str
	predicted_label	str
	count	int

Rows: 5

Data:
+-----------------+-----------------+-------+
|   target_label  | predicted_label | count |
+-----------------+-----------------+-------+
|  Iris-virginica |  Iris-virginica |   16  |
|  Iris-virginica | Iris-versicolor |   1   |
| Iris-versicolor |  Iris-virginica |   1   |
|   Iris-setosa   |   Iris-setosa   |   12  |
| Iris-versicolor | Iris-versicolor |   13  |
+-----------------+-----------------+-------+
[5 rows x 3 columns]
}

Accuracy:  0.9534883720930233
In [14]:
# 6.5 - random forest classifier
model_rf = tc.random_forest_classifier.create(train_Data, target = 'class', features = ['sepal_length','sepal_width','petal_length','petal_width'])
print(model_rf)
    
predictions = model_rf.classify(test_Data)
print("PREDICTIONS: \n", predictions)
    
results_rf = model_rf.evaluate(test_Data)    
print("EVALUATION: \n", results_rf)

# Accuracy
print("\nAccuracy: ", results_rf['accuracy'])
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

Random forest classifier:
--------------------------------------------------------
Number of examples          : 101
Number of classes           : 3
Number of feature columns   : 4
Number of unpacked features : 4
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| Iteration | Elapsed Time | Training Accuracy | Validation Accuracy | Training Log Loss | Validation Log Loss |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| 1         | 0.003951     | 0.960396          | 1.000000            | 0.277072          | 0.219802            |
| 2         | 0.010409     | 0.990099          | 1.000000            | 0.265747          | 0.222135            |
| 3         | 0.013858     | 0.980198          | 1.000000            | 0.272158          | 0.223682            |
| 4         | 0.016869     | 0.980198          | 1.000000            | 0.280415          | 0.226230            |
| 5         | 0.019362     | 0.980198          | 1.000000            | 0.272776          | 0.225882            |
| 10        | 0.031781     | 1.000000          | 1.000000            | 0.265291          | 0.224732            |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
Class                          : RandomForestClassifier

Schema
------
Number of examples             : 101
Number of feature columns      : 4
Number of unpacked features    : 4
Number of classes              : 3

Settings
--------
Number of trees                : 30
Max tree depth                 : 6
Training time (sec)            : 0.0322
Training accuracy              : 1.0
Validation accuracy            : 1.0
Training log_loss              : 0.2653
Validation log_loss            : 0.2247
Training auc                   : 1.0
Validation auc                 : nan

PREDICTIONS: 
 +-------------+--------------------+
|    class    |    probability     |
+-------------+--------------------+
| Iris-setosa | 0.8016387224197388 |
| Iris-setosa | 0.8016387224197388 |
| Iris-setosa | 0.8016387224197388 |
| Iris-setosa | 0.7770652770996094 |
| Iris-setosa | 0.7989017963409424 |
| Iris-setosa | 0.8016387224197388 |
| Iris-setosa | 0.8016387224197388 |
| Iris-setosa | 0.8016387224197388 |
| Iris-setosa | 0.8016387224197388 |
| Iris-setosa | 0.8016387224197388 |
+-------------+--------------------+
[43 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
EVALUATION: 
 {'accuracy': 0.9767441860465116, 'auc': 0.9952745023738937, 'confusion_matrix': Columns:
	target_label	str
	predicted_label	str
	count	int

Rows: 4

Data:
+-----------------+-----------------+-------+
|   target_label  | predicted_label | count |
+-----------------+-----------------+-------+
|   Iris-setosa   |   Iris-setosa   |   12  |
| Iris-versicolor | Iris-versicolor |   13  |
| Iris-versicolor |  Iris-virginica |   1   |
|  Iris-virginica |  Iris-virginica |   17  |
+-----------------+-----------------+-------+
[4 rows x 3 columns]
, 'f1_score': 0.9781305114638448, 'log_loss': 0.31948300577932404, 'precision': 0.9814814814814815, 'recall': 0.9761904761904763, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int
	class	int

Rows: 300003

Data:
+-----------+-----+-----+----+----+-------+
| threshold | fpr | tpr | p  | n  | class |
+-----------+-----+-----+----+----+-------+
|    0.0    | 1.0 | 1.0 | 12 | 31 |   0   |
|   1e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   2e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   3e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   4e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   5e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   6e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   7e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   8e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   9e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
+-----------+-----+-----+----+----+-------+
[300003 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

Accuracy:  0.9767441860465116
In [15]:
# Automatically picks the right model based on your data.
model = tc.classifier.create(train_Data, target = 'class', features = ['sepal_length','sepal_width','petal_length','petal_width'])

# Generate predictions (class/probabilities etc.), contained in an SFrame.
predictions = model.classify(test_Data)
print(predictions)

# Evaluate the model, with the results stored in a dictionary
results = model.evaluate(test_Data)        
print(results)

print(); print(results['accuracy'])
print(); print(results['confusion_matrix'])
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: BoostedTreesClassifier, RandomForestClassifier, DecisionTreeClassifier, LogisticClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.
Boosted trees classifier:
--------------------------------------------------------
Number of examples          : 104
Number of classes           : 3
Number of feature columns   : 4
Number of unpacked features : 4
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| Iteration | Elapsed Time | Training Accuracy | Validation Accuracy | Training Log Loss | Validation Log Loss |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| 1         | 0.003834     | 1.000000          | 0.666667            | 0.731298          | 0.933594            |
| 2         | 0.009357     | 1.000000          | 0.666667            | 0.515390          | 0.875659            |
| 3         | 0.012927     | 1.000000          | 0.666667            | 0.374311          | 0.870579            |
| 4         | 0.018808     | 1.000000          | 0.666667            | 0.277415          | 0.896293            |
| 5         | 0.024611     | 1.000000          | 0.666667            | 0.208872          | 0.941197            |
| 10        | 0.039268     | 1.000000          | 0.666667            | 0.061323          | 0.955704            |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
Random forest classifier:
--------------------------------------------------------
Number of examples          : 104
Number of classes           : 3
Number of feature columns   : 4
Number of unpacked features : 4
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| Iteration | Elapsed Time | Training Accuracy | Validation Accuracy | Training Log Loss | Validation Log Loss |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| 1         | 0.009330     | 0.980769          | 0.666667            | 0.257495          | 0.775348            |
| 2         | 0.014835     | 0.990385          | 0.666667            | 0.257274          | 0.563280            |
| 3         | 0.019578     | 0.990385          | 0.666667            | 0.256312          | 0.516609            |
| 4         | 0.024181     | 0.990385          | 0.666667            | 0.254434          | 0.486992            |
| 5         | 0.030355     | 0.990385          | 0.666667            | 0.257450          | 0.484198            |
| 10        | 0.046379     | 0.990385          | 0.666667            | 0.257175          | 0.496671            |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
Decision tree classifier:
--------------------------------------------------------
Number of examples          : 104
Number of classes           : 3
Number of feature columns   : 4
Number of unpacked features : 4
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| Iteration | Elapsed Time | Training Accuracy | Validation Accuracy | Training Log Loss | Validation Log Loss |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| 1         | 0.012329     | 1.000000          | 0.666667            | 0.232260          | 0.923433            |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
Logistic regression:
--------------------------------------------------------
Number of examples          : 104
Number of classes           : 3
Number of feature columns   : 4
Number of unpacked features : 4
Number of coefficients      : 10
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training Accuracy | Validation Accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 0         | 1        | 0.000244     | 0.355769          | 0.333333            |
| 1         | 2        | 0.000665     | 0.865385          | 0.666667            |
| 2         | 3        | 0.001119     | 0.942308          | 0.666667            |
| 3         | 4        | 0.001580     | 0.980769          | 0.666667            |
| 4         | 5        | 0.002153     | 0.990385          | 0.666667            |
| 5         | 6        | 0.006230     | 0.990385          | 0.666667            |
| 6         | 7        | 0.016781     | 0.990385          | 0.666667            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.

PROGRESS: Model selection based on validation accuracy:
PROGRESS: ---------------------------------------------
PROGRESS: BoostedTreesClassifier          : 0.6666666666666666
PROGRESS: RandomForestClassifier          : 0.6666666666666666
PROGRESS: DecisionTreeClassifier          : 0.6666666666666666
PROGRESS: LogisticClassifier              : 0.6666666666666666
PROGRESS: ---------------------------------------------
PROGRESS: Selecting BoostedTreesClassifier based on validation set performance.
+-------------+--------------------+
|    class    |    probability     |
+-------------+--------------------+
| Iris-setosa | 0.9495642185211182 |
| Iris-setosa | 0.9495642185211182 |
| Iris-setosa | 0.9495642185211182 |
| Iris-setosa | 0.9495642185211182 |
| Iris-setosa | 0.9495642185211182 |
| Iris-setosa | 0.9495642185211182 |
| Iris-setosa | 0.9495642185211182 |
| Iris-setosa | 0.9495642185211182 |
| Iris-setosa | 0.9495642185211182 |
| Iris-setosa | 0.9495642185211182 |
+-------------+--------------------+
[43 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
{'accuracy': 0.9767441860465116, 'auc': 0.9881862559347346, 'confusion_matrix': Columns:
	target_label	str
	predicted_label	str
	count	int

Rows: 4

Data:
+-----------------+-----------------+-------+
|   target_label  | predicted_label | count |
+-----------------+-----------------+-------+
|   Iris-setosa   |   Iris-setosa   |   12  |
|  Iris-virginica |  Iris-virginica |   16  |
|  Iris-virginica | Iris-versicolor |   1   |
| Iris-versicolor | Iris-versicolor |   14  |
+-----------------+-----------------+-------+
[4 rows x 3 columns]
, 'f1_score': 0.9784047370254267, 'log_loss': 0.1477580383761974, 'precision': 0.9777777777777779, 'recall': 0.9803921568627452, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int
	class	int

Rows: 300003

Data:
+-----------+-----+-----+----+----+-------+
| threshold | fpr | tpr | p  | n  | class |
+-----------+-----+-----+----+----+-------+
|    0.0    | 1.0 | 1.0 | 12 | 31 |   0   |
|   1e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   2e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   3e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   4e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   5e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   6e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   7e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   8e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
|   9e-05   | 1.0 | 1.0 | 12 | 31 |   0   |
+-----------+-----+-----+----+----+-------+
[300003 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

0.9767441860465116

+-----------------+-----------------+-------+
|   target_label  | predicted_label | count |
+-----------------+-----------------+-------+
|   Iris-setosa   |   Iris-setosa   |   12  |
|  Iris-virginica |  Iris-virginica |   16  |
|  Iris-virginica | Iris-versicolor |   1   |
| Iris-versicolor | Iris-versicolor |   14  |
+-----------------+-----------------+-------+
[4 rows x 3 columns]

In [16]:
print("\nOverall Accuracy of the trained models: \n")
print('Logistics_Clasifier: ', results_lc['accuracy'])
print('BoostedTree_Clasifier: ', results_bt['accuracy'])
print('DecisionTree_Clasifier: ', results_dt['accuracy'])
print('NearstNeibourgh_Clasifier: ', results_nn['accuracy'])
print('RandomForest_Clasifier: ', results_rf['accuracy'])
print('Automated_Clasifier: ', results['accuracy'])

print('\nEnd-to-End Applied Machine Learning and Data Science Recipe for Beginners & Business Analysts')
Overall Accuracy of the trained models: 

Logistics_Clasifier:  0.9534883720930233
BoostedTree_Clasifier:  0.9767441860465116
DecisionTree_Clasifier:  0.9534883720930233
NearstNeibourgh_Clasifier:  0.9534883720930233
RandomForest_Clasifier:  0.9767441860465116
Automated_Clasifier:  0.9767441860465116

End-to-End Applied Machine Learning and Data Science Recipe for Beginners & Business Analysts
In [ ]: