In [1]:
# End-to-End Applied Machine Learning & Data Science Recipe with IRIS Dataset
## Title : IRIS Flower Classification using SKLEARN RandomForest Classifier with Monte Carlo Cross Validation
### Knowledge required: Basic Python, Scikit-Learn and Pandas
### System requirements:
###   a) Python (3.X) distribution from Anaconda (Anaconda 3)
# Author: 
##     Nilimesh Halder, PhD
##     BSc in Computer Science and Engineering, 
##         @ Khulna University, Bangladesh.
##     PhD in Artificial Intelligence and Applied Machine Learning, 
##         @ The University of Western Australia, Australia.
In [2]:
#
# Disclaimer :
### The information and recipe presented in this Data Science Recipe are intended only for educational and coaching purposes, for beginners and app developers. 
### Anyone can practice and apply the recipe presented here, but the reader takes full responsibility for his/her actions.
### The author of this recipe (code / program) has made every effort to ensure that the information was accurate at the time of publication. 
### The author does not assume, and hereby disclaims, any liability to any party for any loss, damage, or disruption caused by errors or omissions, 
### whether such errors or omissions result from accident, negligence, or any other cause. 
### Some of the information presented here can also be found in public knowledge domains.
In [3]:
#
# Steps in Applied Machine Learning & Data Science :
## 1. Load Library
## 2. Load the Dataset to which the Machine Learning Algorithm will be applied
###    Either a) load from a CSV file or b) load from a Database   
## 3. Summarisation of Data to understand the dataset (Descriptive Statistics)
## 4. Visualisation of Data to understand the dataset (Plots, Graphs etc.)
## 5. Data pre-processing & Data transformation (split into train-test datasets)
## 6. Application of a Machine Learning Algorithm to the training dataset 
###   a) setup of the ML algorithm and its parameter settings
###   b) cross-validation setup with the training dataset
###   c) training & fitting the Algorithm with the training Dataset
###   d) evaluation of the trained Algorithm (or Model) and its results
###   e) saving the trained model for future prediction
## 7. Load the saved model and apply it to a new dataset for prediction (a minimal sketch of these steps follows below)
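##
## A minimal sketch of the steps above (illustrative only; it uses scikit-learn's bundled copy of the
## IRIS data so it is self-contained, whereas the recipe below works from 'iris.data.csv'):
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

# 1-2. load libraries and the dataset
X, y = load_iris(return_X_y=True)
# 5. split into train and test sets
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=0)
# 6. set up and fit the algorithm, then evaluate it on the hold-out test set
clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_tr, y_tr)
print('Hold-out accuracy:', accuracy_score(y_te, clf.predict(X_te)))
# 6e-7. save the fitted model and reload it later for prediction on new data
with open('sketch_model.pickle', 'wb') as f:
    pickle.dump(clf, f)
with open('sketch_model.pickle', 'rb') as f:
    print(pickle.load(f).predict(X_te[:5]))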
In [5]:
# load necessary libraries

import time
import pandas as pd
import pickle as pk
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import cohen_kappa_score, confusion_matrix

start_time = time.time()

# declare constants
filename = 'iris.data.csv'    
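
## A sketch (not part of the original recipe): if 'iris.data.csv' is missing locally, an equivalent
## file can be rebuilt from scikit-learn's bundled copy of the IRIS data, in the same column order
## and with the same 'Iris-...' class labels.
import os
if not os.path.exists(filename):
    from sklearn.datasets import load_iris
    _iris = load_iris()
    _df = pd.DataFrame(_iris.data,
                       columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
    _df['Species'] = ['Iris-' + _iris.target_names[t] for t in _iris.target]
    # written without a header row, since load_dataset() below passes explicit column names
    _df.to_csv(filename, header=False, index=False)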
In [6]:
# -------------------------------------------------------------------------
# Helper modules for Descriptive Statistics
# -------------------------------------------------------------------------    
def get_redundant_pairs(df):
        pairs_to_drop = set()
        cols = df.columns
        for i in range(0, df.shape[1]):
            for j in range(0, i+1):
                pairs_to_drop.add((cols[i], cols[j]))
        return pairs_to_drop

def get_top_abs_correlations(df, n=5): 
        au_corr = df.corr().unstack()
        labels_to_drop = get_redundant_pairs(df)
        au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
        return au_corr[0:n]

def corrank(X):
        import itertools
        df = pd.DataFrame([[(i,j), 
                   X.corr().loc[i,j]] for i,j in list(itertools.combinations(X.corr(), 2))],
                   columns=['pairs','corr'])
        print(df.sort_values(by='corr',ascending=False))
        print()
In [9]:
# -------------------------------------------------------------------------    
# load dataset
# ------------------------------------------------------------------------- 
def load_dataset(filename):
        col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'Species']
        
        dataset = pd.read_csv(filename, sep = ',', names = col_names)
        
        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        
        feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        target = 'Species'
        
        return feature_names, target, dataset

# execute the function
feature_names, target, dataset = load_dataset(filename)    
(150, 5)
   sepal_length  sepal_width  petal_length  petal_width      Species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'Species'],
      dtype='object')
In [10]:
# -------------------------------------------------------------------------    
# find missing values in dataset if exists
# -------------------------------------------------------------------------
def find_missing_value(feature_names, target, dataset):
        # Count Number of Missing Value on Each Column    
        print('\nCount Number of Missing Value on Each Column: ')        
        print(dataset.isnull().sum(axis=0))

# execute the function
find_missing_value(feature_names, target, dataset)        
Count Number of Missing Value on Each Column: 
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
Species         0
dtype: int64
In [11]:
# -------------------------------------------------------------------------
# descriptive statistics and correlation matrix
# -------------------------------------------------------------------------    
def data_descriptiveStats(feature_names, target, dataset):
        # Count Number of Missing Value on Each Column    
        print(); print('Count Number of Missing Value on Each Column: ')        
        print(); print(dataset[feature_names].isnull().sum(axis=0))
        print(); print(dataset[target].isnull().sum(axis=0))    
    
        # Get Information on the feature variables
        print(); print('Get Information on the feature variables: ')            
        print(); print(dataset[feature_names].info())
        print(); print(dataset[feature_names].describe())
    
        # correlation
        pd.set_option('display.precision', 2)
        print(); print(dataset[feature_names].corr())    
    
        # Ranking of Correlation Coefficients among Variable Pairs
        print(); print("Ranking of Correlation Coefficients:")    
        corrank(dataset[feature_names])

        # Print Highly Correlated Variables
        print(); print("Highly correlated variables (Absolute Correlations):")
        print(); print(get_top_abs_correlations(dataset[feature_names], 8))
    
        # Get Information on the target    
        print(); print(dataset[target].describe())    
        print(); print(dataset.groupby(target).size())    

data_descriptiveStats(feature_names, target, dataset)
Count Number of Missing Value on Each Column: 

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
dtype: int64

0

Get Information on the feature variables: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
dtypes: float64(4)
memory usage: 4.8 KB
None

       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

              sepal_length  sepal_width  petal_length  petal_width
sepal_length          1.00        -0.11          0.87         0.82
sepal_width          -0.11         1.00         -0.42        -0.36
petal_length          0.87        -0.42          1.00         0.96
petal_width           0.82        -0.36          0.96         1.00

Ranking of Correlation Coefficients:
                          pairs  corr
5   (petal_length, petal_width)  0.96
1  (sepal_length, petal_length)  0.87
2   (sepal_length, petal_width)  0.82
0   (sepal_length, sepal_width) -0.11
4    (sepal_width, petal_width) -0.36
3   (sepal_width, petal_length) -0.42


Highly correlated variables (Absolute Correlations):

petal_length  petal_width     0.96
sepal_length  petal_length    0.87
              petal_width     0.82
              sepal_width    -0.11
sepal_width   petal_width    -0.36
              petal_length   -0.42
dtype: float64

count             150
unique              3
top       Iris-setosa
freq               50
Name: Species, dtype: object

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64
In [12]:
# -------------------------------------------------------------------------
# data visualisation and correlation graph
# -------------------------------------------------------------------------
def data_visualization(feature_names, target, dataset):
        # BOX plots USING box and whisker plots
        i = 1
        print(); print('BOX plot of each numerical feature')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(2,2,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=True, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
            i += 1
        plt.show()    
    
        # USING histograms
        j = 1
        print(); print('Histogram of each Numerical Feature')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(2,2,j)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].hist()
            j += 1
        plt.show()

        # correlation matrix
        print(); print('Correlation Matrix of All Numerical Features')   
        fig = plt.figure(figsize=(11,9))
        ax = fig.add_subplot(111)
        cax = ax.matshow(dataset[feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
        fig.colorbar(cax)
        ticks = np.arange(0,4,1)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        plt.show()

        # Correlation Plot using seaborn
        print(); print("Correlation plot of Numerical features")
        # Compute the correlation matrix
        corr = dataset[feature_names].corr()
        print(corr)
        # Generate a mask for the upper triangle
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        # Set up the matplotlib figure
        f, ax = plt.subplots(figsize=(11, 9))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, vmin= -1.0, center=0, square=True, 
                    linewidths=.5, cbar_kws={"shrink": .5})
        plt.show()    
    
        # PairPlot using seaborn
        print(); print('Scatter Matrix Plot')
        sns.pairplot(dataset, hue = 'Species')
        plt.show()
    
        # Pie chart for Categorical Variables
        print(); print('PIE Chart for Target: ')
        plt.figure(figsize=(11,9)) 
        i = 1
        for colName in [target]:
            labels = []; sizes = [];
            df = dataset.groupby(colName).size()
            for key in df.keys():
                labels.append(key)
                sizes.append(df[key])
            # Plot PIE Chart with %
            plt.subplot(2,2,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, 
                            labelleft=True, labeltop=True, labelright=False, labelbottom=False)        
            plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
            plt.axis('equal')
            i += 1; plt.savefig('Piefig.pdf', format='pdf')
        plt.show()    

data_visualization(feature_names, target, dataset)
BOX plot of each numerical feature
Histogram of each Numerical Feature
Correlation Matrix of All Numerical Features
Correlation plot of Numerical features
              sepal_length  sepal_width  petal_length  petal_width
sepal_length          1.00        -0.11          0.87         0.82
sepal_width          -0.11         1.00         -0.42        -0.36
petal_length          0.87        -0.42          1.00         0.96
petal_width           0.82        -0.36          0.96         1.00
Scatter Matrix Plot
PIE Chart for Target: 
In [13]:
# -------------------------------------------------------------------------
# data split to train and test datasets
# -------------------------------------------------------------------------    
def data_split(feature_names, target, dataset):
        # Data Transform - Split train : test datasets
        X_train, X_test, y_train, y_test = train_test_split(dataset.loc[:, feature_names], 
                                                            dataset.loc[:, target], test_size=0.33)
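        # Note (suggestion, not in the original run): passing random_state=<int> and
        # stratify=dataset.loc[:, target] here would make the split reproducible and keep
        # the three species equally represented in the train and test sets.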
        return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)

# -------------------------------------------------------------------------
# model training
# -------------------------------------------------------------------------    
def training_model(X_train, y_train):
        _value = []; _model = []; _best_features = [];
        # Create different Feature subsets
        F1 = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        F2 = ['sepal_length', 'sepal_width', 'petal_length']
        F3 = ['sepal_length', 'sepal_width', 'petal_width']
        F4 = ['sepal_length', 'petal_length', 'petal_width']
        F5 = [ 'sepal_width', 'petal_length', 'petal_width']
        F6 = ['petal_length', 'petal_width']
        F7 = ['sepal_length', 'sepal_width']
        
        subsets_sum = [F1] + [F2] + [F3] + [F4] + [F5] + [F6] + [F7]
        
        print(subsets_sum)
        
        # Twelve random states, chosen arbitrarily, for the outer MCCV loop
        for i in [32,41,45,52,65,72,96,97,112,114,128,142]:
            print ('\n\nRandom state : ', i)
            
            model = RandomForestClassifier(random_state=i, n_estimators=200)   

            #  Split the dataset into two stratified parts, 80% for Outer training set
            X1_train, X1_test, y1_train, y1_test = train_test_split(X_train, y_train, 
                                                train_size=0.8, random_state=i, stratify=y_train)

            # Choose k-fold cross-validation technique for the inner loop
            inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

            # Set temporary variables
            best_subset = []
            best_acc = -np.inf

            # Loop over the features combinations
            for subset in subsets_sum:
                score = cross_val_score(model, X=X1_train[subset], y=y1_train, 
                                        cv = inner_cv.split(X1_train[subset], y1_train), 
                                        scoring='accuracy')
                if score.mean() > best_acc:
                    best_acc = score.mean()
                    best_subset = subset

            # Train the model on the Outer training set with the selected feature combination
            model = model.fit(X1_train[best_subset], y1_train)
            
            # Calculate the predicted labels with the model on the Outer test set with the selected feature combination
            y1_pred = model.predict(X1_test[best_subset])
            
            # Calculate the accuracy between predicted and true labels
            acc = accuracy_score(y1_test, y1_pred)
            print('Selected features:', best_subset,'; Outer Test ACC: ',acc)
            
            _best_features.append(best_subset); _value.append(acc); _model.append(model); 
        
        #for i in range(0, len(_value)):
        #    print(); print(_best_features[i]); print('Accuracy: ',_value[i]); print(_model[i])
        
        print(); print(_value)
        print(); print('Maximum Accuracy Index: ', np.argmax(_value))
        
        idx = np.argmax(_value)
        print("\nBest model parameters with random_state:");    print(_model[idx])
        print("\nBest feature combination:");    print(_best_features[idx])
        print("\nBest accuracy from MCCV:");    print(_value[idx])
        
        return(_model[idx], _best_features[idx])


model, feature_names = training_model(X_train, y_train)
[['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], ['sepal_length', 'sepal_width', 'petal_length'], ['sepal_length', 'sepal_width', 'petal_width'], ['sepal_length', 'petal_length', 'petal_width'], ['sepal_width', 'petal_length', 'petal_width'], ['petal_length', 'petal_width'], ['sepal_length', 'sepal_width']]


Random state :  32
Selected features: ['sepal_length', 'sepal_width', 'petal_width'] ; Outer Test ACC:  0.95


Random state :  41
Selected features: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] ; Outer Test ACC:  0.95


Random state :  45
Selected features: ['sepal_length', 'sepal_width', 'petal_width'] ; Outer Test ACC:  0.9


Random state :  52
Selected features: ['sepal_length', 'sepal_width', 'petal_width'] ; Outer Test ACC:  0.95


Random state :  65
Selected features: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] ; Outer Test ACC:  1.0


Random state :  72
Selected features: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] ; Outer Test ACC:  0.9


Random state :  96
Selected features: ['sepal_length', 'sepal_width', 'petal_width'] ; Outer Test ACC:  0.9


Random state :  97
Selected features: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] ; Outer Test ACC:  0.95


Random state :  112
Selected features: ['petal_length', 'petal_width'] ; Outer Test ACC:  0.95


Random state :  114
Selected features: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] ; Outer Test ACC:  0.95


Random state :  128
Selected features: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] ; Outer Test ACC:  0.95


Random state :  142
Selected features: ['sepal_length', 'sepal_width', 'petal_width'] ; Outer Test ACC:  0.95

[0.95, 0.95, 0.9, 0.95, 1.0, 0.9, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95]

Maximum Accuracy Index:  4

Best model parameters with random_state:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=65, verbose=0, warm_start=False)

Best feature combination:
['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

Best accuracy from MCCV:
1.0
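# -------------------------------------------------------------------------
# MCCV skill summary (a sketch, not part of the original run)
# -------------------------------------------------------------------------
## MCCV skill is usually reported as the mean +/- standard deviation over all repeats rather
## than the single best repeat; this assumes the per-repeat accuracies (the `_value` list inside
## training_model) are available; here the values printed above are simply reused.
outer_acc = [0.95, 0.95, 0.9, 0.95, 1.0, 0.9, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95]
print('MCCV accuracy: %.3f +/- %.3f over %d repeats'
      % (np.mean(outer_acc), np.std(outer_acc), len(outer_acc)))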
In [16]:
# -----------------------------------------------
# Evaluate the skill of the Trained model
# -----------------------------------------------
def evaluate_model(model, features, X_train, y_train, X_test, y_test):        
        print()
        print(model.get_params(deep=True))
        
        # Evaluate the skill of the Trained model
        pred_Class          = model.predict(X_test[features])
        acc                 = accuracy_score(y_test, pred_Class)
        classReport         = classification_report(y_test, pred_Class)
        confMatrix          = confusion_matrix(y_test, pred_Class) 
        kappa_score         = cohen_kappa_score(y_test, pred_Class)         
        
        print(); print('Evaluation of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)

        pred_proba = model.predict_proba(X_test[features])
        
        # Add more plots here using scikit-plot
        # ROC curves
        skplt.metrics.plot_roc(y_test,pred_proba,figsize=(9,6)); plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_test,pred_Class,figsize=(9,6)); plt.show()        
        
        # precision recall curve
        skplt.metrics.plot_precision_recall(y_test, pred_proba, 
                title='Precision-Recall Curve', plot_micro=True, 
                classes_to_plot=None, ax=None, figsize=(9,6), 
                cmap='nipy_spectral', title_fontsize='large', 
                text_fontsize='medium'); plt.show()        
        
        # Add more ... ... ...
        
        # plot learning Curves
        skplt.estimators.plot_learning_curve(model, X_train[features], y_train, figsize=(9,6))
        plt.show()
        
        return model

model = evaluate_model(model, feature_names, X_train, y_train, X_test, y_test)
{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': 1, 'oob_score': False, 'random_state': 65, 'verbose': 0, 'warm_start': False}

Evaluation of the trained model: 

Accuracy :  0.94

Kappa Score :  0.9098557692307693

Confusion Matrix :
 [[18  0  0]
 [ 0 16  0]
 [ 0  3 13]]

Classification Report :
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        18
Iris-versicolor       0.84      1.00      0.91        16
 Iris-virginica       1.00      0.81      0.90        16

    avg / total       0.95      0.94      0.94        50

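# -----------------------------------------------
# Sanity check of the Kappa Score (a sketch, not part of the original run)
# -----------------------------------------------
## Cohen's kappa = (p_o - p_e) / (1 - p_e); recomputing it directly from the confusion
## matrix printed above reproduces the cohen_kappa_score value.
cm = np.array([[18, 0, 0], [0, 16, 0], [0, 3, 13]])
n = cm.sum()
p_o = np.trace(cm) / n                                    # observed agreement: 47/50 = 0.94
p_e = (cm.sum(axis=1) * cm.sum(axis=0)).sum() / n ** 2    # chance agreement from the marginals
print('kappa =', (p_o - p_e) / (1 - p_e))                 # ~0.9099, matching the value above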
In [18]:
# ------------------------------------------------
# Feature Rank Analysis
# -------------------------------------------------
def featureRank_Analysis(model, dataset, cols):
        print()
        print("Feature Importance/Rank Analysis: ")
        X = dataset.loc[:, cols]; X_cols = X.columns.values
    
        features_imp = model.feature_importances_    
    
        indices = np.argsort(features_imp)[::-1]
        df = {}
        for f in range(X.shape[1]):
            print("%d. feature %d %s (%f)" % (f + 1, indices[f], X_cols[indices[f]], 
                                              features_imp[indices[f]]))
            df[f] = [f + 1, indices[f], X_cols[indices[f]], features_imp[indices[f]]]

        df1 = pd.DataFrame.from_dict(df, orient = 'index')
        df1.columns = ['feature_Rank', 'feature_Index', 'feature_Name', 'feature_importance']
        df1.to_csv("FeatureImportanceRank.csv", index = False)

        # create a figure 9 inches wide, 6 inches high
        plt.figure(figsize=(9,6)) 
        plt.barh(df1['feature_Rank'], df1['feature_importance'], tick_label = df1['feature_Name'])
        plt.savefig('Featurefig.pdf', format='pdf')
        plt.show()   

        skplt.estimators.plot_feature_importances(model, feature_names=cols,
                                                  x_tick_rotation = 45, figsize=(9,6))
        plt.show()
        
featureRank_Analysis(model, dataset, feature_names)
Feature Importance/Rank Analysis: 
1. feature 2 petal_length (0.434299)
2. feature 3 petal_width (0.403042)
3. feature 0 sepal_length (0.142164)
4. feature 1 sepal_width (0.020495)
In [19]:
# ------------------
# save the model
# ------------------
def save_model(model):
        with open('DSC_Recipe_4_model.pickle', 'wb') as f: 
            pk.dump(model, f)

save_model(model) 
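# ------------------
# Optional alternative (a sketch): persist the model with joblib
# ------------------
## scikit-learn models are often saved with joblib instead of pickle, since it handles the
## large NumPy arrays inside fitted estimators more efficiently.
from sklearn.externals import joblib   # in recent scikit-learn versions: import joblib
joblib.dump(model, 'DSC_Recipe_4_model.joblib')
model_from_joblib = joblib.load('DSC_Recipe_4_model.joblib')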
In [22]:
# ------------------------------------------------
# Load the model from disk and make predictions
# ------------------------------------------------
def final_prediction(feature_names, filename):
        # load model
        with open('DSC_Recipe_4_model.pickle', 'rb') as f:
            model = pk.load(f)
        
        # load dataset
        col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'Species']
        dataset = pd.read_csv(filename, sep = ',', names = col_names)
        
        # final prediction and results
        predicted_species     = model.predict(dataset[feature_names])
        dataset['predicted_species'] = predicted_species
        dataset.to_csv('FinalResult.csv', index = False)

final_prediction(feature_names, filename)        
In [23]:
print()
print("Required Time %s seconds: " % (time.time() - start_time))
Required Time 351.0933120250702 seconds: 
In [ ]: