# Project 01: Decision Tree with GSCV

# -----------------------------------------------------------------------------------------------
# Data Science Competition Recipe - 001
# Using Package: scikit-learn, Algorithm: Decision Tree Classifier, DataSet: OpenML bank Churn Dataset
# Tuning: Parameters tuning using GridSearchCV
# -----------------------------------------------------------------------------------------------

import warnings
warnings.filterwarnings("ignore")

def DSC_Recipe_1():
    """Run the bank-churn decision-tree recipe.

    Prints a banner, imports the libraries used by the nested helper
    functions defined further down in this function, records the start time
    and declares the fold count shared by the cross-validation steps.
    """
    print()
    print(format('Recipe for Data Science Competition - DSC_Recipe_1','*^65'))
    print(format('Classification with OpenML bank Churn dataset using scikit-learn decision tree and K-fold Cross Valodation', '*^95'))    
    print(format('Package: scikit-learn ','*^65'))            
    print(format('Algorithm: Decision Tree Model','*^65'))            
    print(format('DataSet: OpenML bank Churn Dataset', '*^65'))    
    print(format('Model selection: using GridSearchCV from scikit-learn', '*^65'))    

    # Load the libraries used by the nested helpers; they are imported here
    # (function scope) so the helpers pick them up through the closure.
    import time
    import pandas as pd
    import pickle as pk
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    import scikitplot as skplt
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.metrics import cohen_kappa_score, confusion_matrix
    from sklearn.preprocessing import LabelEncoder    
    # Wall-clock reference used for the elapsed-time report at the end of the run.
    start_time = time.time()
    
    # -------------------------------------------------------------------------
    # declare constants
    # -------------------------------------------------------------------------
    # Number of folds passed as ``cv`` to GridSearchCV and cross_val_score below.
    kfold = 10
    
    # -------------------------------------------------------------------------
    # Helper modules for Descriptive Statistics
    # -------------------------------------------------------------------------    
    def get_redundant_pairs(df):
        """Return the (column, column) label pairs on or below the diagonal.

        For every column the result contains its pairing with itself and with
        each column that precedes it in ``df.columns``. Dropping these labels
        from an unstacked correlation matrix leaves each unordered pair once.
        """
        names = list(df.columns)
        return {
            (later, earlier)
            for pos, later in enumerate(names)
            for earlier in names[:pos + 1]
        }

    def get_top_abs_correlations(df, n=5):
        """Return the ``n`` largest pairwise correlations among ``df`` columns.

        The correlation matrix is unstacked into a pair-indexed Series, the
        diagonal and lower-triangle labels reported by
        ``get_redundant_pairs`` are removed, and the remainder is sorted in
        descending order.

        NOTE(review): despite the function name, the values ranked are the
        signed correlations (no ``abs()`` is applied), so strong negative
        correlations sort last rather than first.
        """
        stacked = df.corr().unstack()
        redundant = get_redundant_pairs(df)
        ranked = stacked.drop(labels=redundant)
        ranked = ranked.sort_values(ascending=False)
        return ranked[0:n]

    def corrank(X):
        """Print every unordered column pair of ``X`` ranked by correlation.

        Builds a two-column table (``pairs``, ``corr``) with one row per
        combination of two columns, prints it sorted by ``corr`` in
        descending order, then prints a blank line. Returns ``None``.

        The correlation matrix is computed once up front; recomputing it for
        every pair makes the cost grow with the square of the column count.
        """
        import itertools

        corr_matrix = X.corr()
        rows = [[(first, second), corr_matrix.loc[first, second]]
                for first, second in itertools.combinations(corr_matrix, 2)]
        ranking = pd.DataFrame(rows, columns=['pairs', 'corr'])
        print(ranking.sort_values(by='corr', ascending=False))
        print()

    # Helper module for Label Encoding for Categorical Features
    def dummyEncode(df):
        """Label-encode every ``category``/``object`` column of ``df`` in place.

        Each selected column is replaced by the integer codes produced by a
        ``LabelEncoder`` that is re-fitted on that column alone. Encoding is
        best-effort: a column that cannot be encoded is reported on stdout
        and left unchanged.

        Parameters:
            df: DataFrame to encode; it is modified and also returned.

        Returns:
            The same ``df`` object.
        """
        columnsToEncode = list(df.select_dtypes(include=['category',
                                                         'object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                df[feature] = le.fit_transform(df[feature])
            except Exception as exc:
                # Catch Exception (not a bare except) so Ctrl-C / SystemExit
                # still propagate, and report why the column was skipped.
                print('Error encoding ' + str(feature) + ': ' + str(exc))
        return df

    # -------------------------------------------------------------------------    
    # load dataset
    # ------------------------------------------------------------------------- 
    def load_dataset(filename):
        """Read the churn CSV and return the encoded modelling frame.

        Prints the shape, first rows, columns and dtypes of the raw file,
        keeps only the feature columns plus the target, and label-encodes
        the categorical ones via ``dummyEncode``.

        Parameters:
            filename: path of a comma-separated file containing the feature
                columns listed below and an ``Exited`` column.

        Returns:
            ``(feature_names, target, dataset)`` where ``dataset`` holds the
            feature columns followed by the target column.
        """
        dataset = pd.read_csv(filename, sep=',')

        print(dataset.shape)
        print(dataset.head(5))
        print(dataset.columns)
        print(dataset.dtypes)

        feature_names = ['CreditScore', 'Geography',
                         'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                         'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

        target = 'Exited'

        # Select from the single list above (no duplicated literal) and copy:
        # dummyEncode assigns columns in place, which must not happen on a
        # slice of the raw frame.
        selected = dataset[feature_names + [target]].copy()
        dataset = dummyEncode(selected)

        return feature_names, target, dataset

    # -------------------------------------------------------------------------    
    # find missing values in dataset if exists
    # -------------------------------------------------------------------------
    def find_miising_value(feature_names, target, dataset):
        """Report missing values, drop uninformative columns and fill NaNs.

        Steps, each announced on stdout:
          1. print per-column missing-value counts;
          2. drop non-target columns whose standard deviation is exactly 0
             (this drop is applied to the passed-in frame in place);
          3. drop columns whose name duplicates an earlier column;
          4. drop non-target columns with fewer than two distinct values;
          5. fill remaining NaNs by dtype: ``int64`` with 0, ``float64`` with
             0.0, ``object`` with ``'Unknown'``.

        Parameters:
            feature_names: accepted for signature symmetry with the other
                steps; not read here. NOTE: it is not updated when a feature
                column is dropped, so the caller's list can become stale.
            target: name of the label column, which is never dropped.
            dataset: DataFrame to clean.

        Returns:
            The cleaned DataFrame.
        """
        print()
        print('#---------------------------------------------------------------')
        print('Check for Mising Value or NaN Value in the Dataset')
        print('#---------------------------------------------------------------')
        # Method - 1: count missing values per column (computed once, reused).
        print('\nCount Number of Missing Value on Each Column: ')
        null_counts = dataset.isnull().sum(axis=0)
        print(null_counts)

        # Method - 2: which columns contain any missing value at all.
        cols_with_nan = dataset.columns[null_counts != 0]
        feature_count = cols_with_nan.size
        print()
        print("Total Features with missing Values = " + str(feature_count))

        if feature_count:
            print()
            print("Features with NaN => {}".format(list(cols_with_nan)))
            print('Count Number of Missing Value on Each Column: ')
            print(dataset[cols_with_nan].isnull().sum().sort_values(ascending=False))

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove constant columns in the Dataset')
        print('#---------------------------------------------------------------')
        # Use the ``target`` argument rather than a hard-coded column name so
        # the label column is protected whatever it is called.
        colsToRemove = [col for col in dataset.columns
                        if col != target and dataset[col].std() == 0]
        print()
        print("Removed `{}` Constant Columns: ".format(len(colsToRemove)))
        print(colsToRemove)
        # remove constant columns in the Dataset
        dataset.drop(colsToRemove, axis=1, inplace=True)

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove Duplicate Columns in the Dataset')
        print('#---------------------------------------------------------------')
        print()
        print(dataset.columns); print(dataset.head(5))
        print('\nDuplicate Columns in the Dataset: \n', dataset.columns.duplicated())
        # Explicit copy so the in-place edits below act on an independent frame.
        dataset = dataset.loc[:, ~dataset.columns.duplicated()].copy()
        print()
        print(dataset.columns); print(dataset.head(5))

        print()
        print('#---------------------------------------------------------------')
        print('Check and Drop Sparse Data/Columns in the Dataset')
        print('#---------------------------------------------------------------')
        flist = [x for x in dataset.columns if x != target]
        print(); print(flist)
        for f in flist:
            if len(np.unique(dataset[f])) < 2:
                print('Feature contains Sparse Data: ', f)
                dataset.drop(f, axis=1, inplace=True)
        print()
        print(dataset.columns); print(dataset.head(5))

        # --------------------------------------------------
        # Missing Values treatment in the DataSet (if any)
        # --------------------------------------------------
        # Fill NULL values according to each column's dtype.
        gd = dataset.columns.to_series().groupby(dataset.dtypes).groups
        print('\nGroup Columns according to their dataTypes: \n', gd)
        for colName in dataset.columns.values.tolist():
            col_dtype = dataset[colName].dtypes
            if col_dtype == 'int64':
                dataset[colName] = dataset[colName].fillna(0)
            elif col_dtype == 'float64':
                dataset[colName] = dataset[colName].fillna(0.0)
            elif col_dtype == 'object':
                dataset[colName] = dataset[colName].fillna('Unknown')

        # Re-count after filling to confirm nothing is left.
        print('\nCount Number of Missing Value on Each Column: ')
        remaining_nulls = dataset.isnull().sum(axis=0)
        print(remaining_nulls)

        feature_count = dataset.columns[remaining_nulls != 0].size
        print()
        print("Total Features with missing Values = " + str(feature_count))

        return dataset

    # -------------------------------------------------------------------------
    # descriptive statistics and correlation matrix
    # -------------------------------------------------------------------------    
    def data_descriptiveStats(feature_names, target, dataset):
        """Print descriptive statistics and correlation summaries.

        Output, in order: missing-value counts for the features and the
        target, ``info()`` and ``describe()`` of the features, the feature
        correlation matrix, the pair ranking from ``corrank``, the top 75
        pairs from ``get_top_abs_correlations``, and the target's
        ``describe()`` and class sizes.

        Side effect: sets the global pandas option ``display.precision`` to 2.
        """
        features = dataset[feature_names]

        # Count Number of Missing Value on Each Column
        print(); print('Count Number of Missing Value on Each Column: ')
        print(); print(features.isnull().sum(axis=0))
        print(); print(dataset[target].isnull().sum(axis=0))

        # Get Information on the feature variables
        print(); print('Get Information on the feature variables: ')
        # info() writes its report itself and returns None, so it is not
        # wrapped in print() (that would emit a stray "None" line).
        print(); features.info()
        print(); print(features.describe())

        # correlation
        # Use the fully qualified option name: the bare 'precision' pattern
        # is ambiguous on pandas versions that also define
        # 'styler.format.precision' and raises OptionError there.
        pd.set_option('display.precision', 2)
        print(); print(features.corr())

        # Ranking of Correlation Coefficients among Variable Pairs
        print(); print("Ranking of Correlation Coefficients:")
        corrank(features)

        # Print Highly Correlated Variables
        print(); print("Highly correlated variables (Absolute Correlations):")
        print(); print(get_top_abs_correlations(features, 75))

        # Get Information on the target
        print(); print(dataset[target].describe())
        print(); print(dataset.groupby(target).size())

    # -------------------------------------------------------------------------
    # data visualisation and correlation graph
    # -------------------------------------------------------------------------
    def data_visualization(feature_names, target, dataset):
        """Show exploratory plots for the features and the target.

        Draws, in order: a grid of box plots, a grid of histograms, a
        ``matshow`` correlation matrix, a seaborn pair plot coloured by
        ``target``, a masked seaborn correlation heat map, and a pie chart of
        the target class sizes (also saved to ``Piefig.pdf``). Every figure
        is displayed with ``plt.show()``.

        The subplot grid and the correlation-matrix ticks are sized from
        ``len(feature_names)``; for up to ten features the grid is the
        original 5x2 layout.
        """
        n_features = len(feature_names)
        # Two plots per row; never fewer than 5 rows so the layout for <= 10
        # features is unchanged, and more rows instead of an error beyond that.
        n_rows = max(5, (n_features + 1) // 2)

        # BOX plots USING box and whisker plots
        print(); print('BOX plot of each numerical features')
        plt.figure(figsize=(11,9))
        for i, col in enumerate(feature_names, start=1):
            plt.subplot(n_rows, 2, i)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=True,
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
        plt.show()

        # USING histograms
        print(); print('Histogram of each Numerical Feature')
        plt.figure(figsize=(11,9))
        for j, col in enumerate(feature_names, start=1):
            plt.subplot(n_rows, 2, j)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False,
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].hist()
        plt.show()

        # correlation matrix
        print(); print('Correlation Matrix of All Numerical Features')
        fig = plt.figure(figsize=(11,9))
        ax = fig.add_subplot(111)
        cax = ax.matshow(dataset[feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
        fig.colorbar(cax)
        # One tick per feature instead of a fixed 10.
        ticks = np.arange(0, n_features, 1)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        plt.show()

        # Seaborn pairplot
        sns.pairplot(dataset, hue = target)
        plt.show()

        # Correlation Plot using seaborn
        print(); print("Correlation plot of Numerical features")
        # Compute the correlation matrix
        corr = dataset[feature_names].corr()
        print(corr)
        # Generate a mask for the upper triangle. The builtin ``bool`` is
        # used because the ``np.bool`` alias no longer exists in current NumPy.
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        # Set up the matplotlib figure
        f, ax = plt.subplots(figsize=(11, 9))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, vmin= -1.0, center=0, square=True,
                    linewidths=.5, cbar_kws={"shrink": .5})
        plt.show()

        # Pie chart for Categorical Variables
        print(); print('PIE Chart of for Target: ')
        plt.figure(figsize=(11,9))
        for i, colName in enumerate([target], start=1):
            class_sizes = dataset.groupby(colName).size()
            labels = list(class_sizes.keys())
            sizes = [class_sizes[key] for key in labels]
            # Plot PIE Chart with %
            plt.subplot(2,2,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False,
                            labelleft=True, labeltop=True, labelright=False, labelbottom=False)
            plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
            plt.axis('equal')
            plt.savefig('Piefig.pdf', format='pdf')
        plt.show()

    # -------------------------------------------------------------------------
    # data split to train and test datasets
    # -------------------------------------------------------------------------    
    def data_split(feature_names, target, dataset, test_size=0.05, random_state=None):
        """Split ``dataset`` into train and test features/labels.

        Parameters:
            feature_names: columns used as model inputs.
            target: name of the label column.
            dataset: DataFrame containing both.
            test_size: forwarded to ``train_test_split``; defaults to the
                previously hard-coded 0.05.
            random_state: forwarded to ``train_test_split``; the default
                ``None`` keeps the earlier unseeded behaviour, pass an int
                for a reproducible split.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.
        """
        features = dataset.loc[:, feature_names]
        labels = dataset.loc[:, target]
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=random_state)

        return X_train, X_test, y_train, y_test

    def training_model(X_train, y_train):
        """Grid-search a ``DecisionTreeClassifier`` and return the best one.

        Runs ``GridSearchCV`` with ``kfold`` folds (taken from the enclosing
        scope) over the parameter grid below, prints the best estimator,
        score and parameters, and returns ``grid.best_estimator_``.
        """
        search_space = {
            'max_depth': [4, 6, 8],
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3],
            'random_state': [7, 23, 42, 78, 142],
            'min_impurity_decrease': [0.0, 0.05, 0.1, 0.2],
            # Add more parameters here for tuning
        }
        grid = GridSearchCV(
            estimator=DecisionTreeClassifier(),
            param_grid=search_space,
            cv=kfold,
            verbose=1,
            n_jobs=-1,
            refit=True,
        )
        grid.fit(X_train, y_train)

        # Results from Grid Search
        print("\n========================================================")
        print(" Results from Grid Search ")
        print("========================================================")
        summary = (
            ("\n The best estimator across ALL searched params:\n", grid.best_estimator_),
            ("\n The best score across ALL searched params:\n", grid.best_score_),
            ("\n The best parameters across ALL searched params:\n", grid.best_params_),
        )
        for heading, value in summary:
            print(heading, value)
        print("\n ========================================================")

        return grid.best_estimator_

    def cross_validatin_and_fitting(model, X_train, y_train):
        """Cross-validate ``model``, fit it on the training data and return it.

        Prints the per-fold accuracies with their mean and standard
        deviation (``kfold`` folds, from the enclosing scope), fits the model
        on the full training set, prints its parameters and shows a learning
        curve.
        """
        fold_scores = cross_val_score(
            model, X_train, y_train,
            cv=kfold, scoring='accuracy', n_jobs=-1, verbose=1,
        )
        # Cross Validation Results
        print()
        print("Cross Validation results: ", fold_scores)
        print("CV Mean Accuracy: %f (Std: %f)" % (fold_scores.mean(), fold_scores.std()))

        # Final fitting of the Model
        model.fit(X_train, y_train)

        rule = '========================================================'
        print()
        print(rule)
        print()
        print(model.get_params(deep=True))
        print()
        print(rule)

        # plot learning Curves
        skplt.estimators.plot_learning_curve(model, X_train, y_train, figsize=(6, 6))
        plt.show()

        return model

    def evaluate_model(model, X_test, y_test):
        """Score ``model`` on the held-out split and return it unchanged.

        Prints accuracy, Cohen's kappa, the confusion matrix and the
        classification report for the predictions on ``X_test``, then shows
        a confusion-matrix plot.
        """
        pred_Class = model.predict(X_test)

        print()
        print('Evaluation of the trained model: ')
        report_lines = (
            ('Accuracy : ', accuracy_score(y_test, pred_Class)),
            ('Kappa Score : ', cohen_kappa_score(y_test, pred_Class)),
            ('Confusion Matrix :\n', confusion_matrix(y_test, pred_Class)),
            ('Classification Report :\n', classification_report(y_test, pred_Class)),
        )
        for heading, value in report_lines:
            print()
            print(heading, value)

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_test, pred_Class, figsize=(7, 7))
        plt.show()

        return model
    
    def featureRank_Analysis(model, dataset, cols):
        """Print, save and plot the feature importances of ``model``.

        Features are ranked by ``model.feature_importances_`` in descending
        order. The ranking is printed, written to
        ``FeatureImportanceRank.csv``, drawn as a horizontal bar chart saved
        to ``Featurefig.pdf``, and drawn again with scikit-plot.
        """
        print()
        print("Feature Importance/Rank Analysis: ")
        X_cols = dataset.loc[:, cols].columns.values
        n_cols = len(X_cols)

        features_imp = model.feature_importances_

        # Indices of the features, most important first.
        order = np.argsort(features_imp)[::-1]
        ranking = {}
        for pos in range(n_cols):
            idx = order[pos]
            rank = pos + 1
            print("%d. feature %d %s (%f)" % (rank, idx, X_cols[idx], features_imp[idx]))
            ranking[pos] = [rank, idx, X_cols[idx], features_imp[idx]]

        df1 = pd.DataFrame.from_dict(ranking, orient='index')
        df1.columns = ['feature_Rank', 'feature_Index', 'feature_Name', 'feature_importance']
        df1.to_csv("FeatureImportanceRank.csv", index=False)

        # this creates a figure 5 inch wide, 3 inch high
        plt.figure(figsize=(5, 3))
        plt.barh(df1['feature_Rank'], df1['feature_importance'],
                 tick_label=df1['feature_Name'])
        plt.savefig('Featurefig.pdf', format='pdf')
        plt.show()

        skplt.estimators.plot_feature_importances(model, feature_names=cols,
                                                  x_tick_rotation=45, figsize=(5, 3))
        plt.show()

        # A disabled sketch for exporting the fitted tree with
        # sklearn.tree.export_graphviz + pydotplus (tree.dot / tree.png /
        # tree.pdf) used to live here as an inert string literal. It needs
        # the Graphviz binaries on PATH and relied on
        # ``sklearn.externals.six``, so it was never executed.
        
    def save_model(model, path='DSC_Recipe_1_model.pickle'):
        """Pickle ``model`` to ``path``.

        Parameters:
            model: any picklable object (here the fitted estimator).
            path: destination file; defaults to the file name that
                ``final_prediction`` reads back.
        """
        with open(path, 'wb') as handle:
            pk.dump(model, handle)

    def final_prediction(feature_names, filename, target='Exited',
                         model_path='DSC_Recipe_1_model.pickle'):
        """Load the pickled model, score it on ``filename`` and save results.

        Prints accuracy, kappa, confusion matrix and classification report,
        shows ROC, confusion-matrix and precision-recall plots, and writes
        the true and predicted labels to ``FinalResult.csv``.

        Parameters:
            feature_names: columns fed to the model.
            filename: comma-separated file holding those columns and the
                ``target`` column.
            target: name of the label column (default ``'Exited'``).
            model_path: pickle file to load (default matches ``save_model``).

        NOTE(review): ``dummyEncode`` re-fits its encoders on this file, so
        the integer codes agree with training only if both files contain the
        same category values -- confirm before scoring a different dataset.
        """
        # SECURITY: unpickling executes code embedded in the file; only load
        # model files produced by a trusted run of this recipe.
        with open(model_path, 'rb') as f:
            model = pk.load(f)

        # load dataset
        dataset = pd.read_csv(filename, sep=',')

        print(dataset.shape)
        print(dataset.head(5))
        print(dataset.columns)
        print(dataset.dtypes)

        dataset = dummyEncode(dataset)

        # final prediction and results
        features = dataset[feature_names]
        predicted_class = model.predict(features)
        pred_proba = model.predict_proba(features)
        dataset['predicted_class'] = predicted_class
        actual = dataset[target]

        # Evaluate the skill of the Trained model
        acc = accuracy_score(actual, predicted_class)
        classReport = classification_report(actual, predicted_class)
        confMatrix = confusion_matrix(actual, predicted_class)
        kappa_score = cohen_kappa_score(actual, predicted_class)

        print(); print('Testing Results of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n', classReport)

        # ROC curves
        skplt.metrics.plot_roc(actual, pred_proba, figsize=(7,7))
        plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(actual, predicted_class, figsize=(7,7))
        plt.show()

        # precision recall curve
        skplt.metrics.plot_precision_recall(actual, pred_proba,
                title='Precision-Recall Curve', plot_micro=True,
                classes_to_plot=None, ax=None, figsize=(7,7),
                cmap='nipy_spectral', title_fontsize='large',
                text_fontsize='medium')
        plt.show()

        dataset.to_csv('FinalResult.csv', index=False,
                       columns=[target, 'predicted_class'])
    
    # Run the pipeline only when the module is executed as a script.
    if __name__ == '__main__':
        print()
        # Report elapsed seconds since start_time; printing start_time itself
        # showed the raw epoch timestamp under an "Execution Time" label.
        print("Execution Time %s seconds: " % (time.time() - start_time))
        filename = 'Bank_Customer_Churn_Modelling_Dataset.csv'

        # Load, clean and explore the data.
        feature_names, target, dataset = load_dataset(filename)
        dataset = find_miising_value(feature_names, target, dataset)
        data_descriptiveStats(feature_names, target, dataset)
        data_visualization(feature_names, target, dataset)

        # Tune, validate, evaluate and persist the model.
        X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)
        model = training_model(X_train, y_train)
        model = cross_validatin_and_fitting(model, X_train, y_train)
        model = evaluate_model(model, X_test, y_test)
        featureRank_Analysis(model, dataset, feature_names)
        save_model(model)

        # NOTE: the "test" file is the same CSV the model was trained on, so
        # the final metrics are not an out-of-sample estimate.
        test_filename = 'Bank_Customer_Churn_Modelling_Dataset.csv'
        final_prediction(feature_names, test_filename)

        print()
        print("Execution Time %s seconds: " % (time.time() - start_time))

# Invoke the recipe. The banner is always printed; the pipeline steps run only
# when this module is executed as a script (see the __name__ guard inside).
DSC_Recipe_1()

*******Recipe for Data Science Competition - DSC_Recipe_1********
Classification with OpenML bank Churn dataset using scikit-learn decision tree and K-fold Cross Valodation
*********************Package: scikit-learn **********************
*****************Algorithm: Decision Tree Model******************
***************DataSet: OpenML bank Churn Dataset****************
******Model selection: using GridSearchCV from scikit-learn******

Execution Time 1604286981.51894 seconds: 
(10000, 14)
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         79084.10       0  
Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

#---------------------------------------------------------------
Check for Mising Value or NaN Value in the Dataset
#---------------------------------------------------------------

Count Number of Missing Value on Each Column: 
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Total Features with missing Values = 0

#---------------------------------------------------------------
Check and Remove constant columns in the Dataset
#---------------------------------------------------------------

Removed `0` Constant Columns: 
[]

#---------------------------------------------------------------
Check and Remove Duplicate Columns in the Dataset
#---------------------------------------------------------------

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')
   CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619          0       0   42       2       0.00              1   
1          608          2       0   41       1   83807.86              1   
2          502          0       0   42       8  159660.80              3   
3          699          0       0   39       1       0.00              2   
4          850          2       0   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          1               1        101348.88       1  
1          0               1        112542.58       0  
2          1               0        113931.57       1  
3          0               0         93826.63       0  
4          1               1         79084.10       0  

Duplicate Columns in the Dataset: 
 [False False False False False False False False False False False]

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')
   CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619          0       0   42       2       0.00              1   
1          608          2       0   41       1   83807.86              1   
2          502          0       0   42       8  159660.80              3   
3          699          0       0   39       1       0.00              2   
4          850          2       0   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          1               1        101348.88       1  
1          0               1        112542.58       0  
2          1               0        113931.57       1  
3          0               0         93826.63       0  
4          1               1         79084.10       0  

#---------------------------------------------------------------
Check and Drop Sparse Data/Columns in the Dataset
#---------------------------------------------------------------

['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')
   CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619          0       0   42       2       0.00              1   
1          608          2       0   41       1   83807.86              1   
2          502          0       0   42       8  159660.80              3   
3          699          0       0   39       1       0.00              2   
4          850          2       0   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          1               1        101348.88       1  
1          0               1        112542.58       0  
2          1               0        113931.57       1  
3          0               0         93826.63       0  
4          1               1         79084.10       0  

Group Columns according to their dataTypes: 
 {int64: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Exited'], float64: ['Balance', 'EstimatedSalary']}

Count Number of Missing Value on Each Column: 
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Total Features with missing Values = 0

Count Number of Missing Value on Each Column: 

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

0

Get Information on the feature variables: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  int64  
 2   Gender           10000 non-null  int64  
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
dtypes: float64(2), int64(8)
memory usage: 781.4 KB
None

       CreditScore  Geography    Gender       Age    Tenure    Balance  \
count     10000.00   10000.00  10000.00  10000.00  10000.00   10000.00   
mean        650.53       0.75      0.55     38.92      5.01   76485.89   
std          96.65       0.83      0.50     10.49      2.89   62397.41   
min         350.00       0.00      0.00     18.00      0.00       0.00   
25%         584.00       0.00      0.00     32.00      3.00       0.00   
50%         652.00       0.00      1.00     37.00      5.00   97198.54   
75%         718.00       1.00      1.00     44.00      7.00  127644.24   
max         850.00       2.00      1.00     92.00     10.00  250898.09   

       NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
count       10000.00   10000.00        10000.00         10000.00  
mean            1.53       0.71            0.52        100090.24  
std             0.58       0.46            0.50         57510.49  
min             1.00       0.00            0.00            11.58  
25%             1.00       0.00            0.00         51002.11  
50%             1.00       1.00            1.00        100193.91  
75%             2.00       1.00            1.00        149388.25  
max             4.00       1.00            1.00        199992.48  

                 CreditScore  Geography    Gender       Age    Tenure  \
CreditScore         1.00e+00   7.89e-03 -2.86e-03 -3.96e-03  8.42e-04   
Geography           7.89e-03   1.00e+00  4.72e-03  2.28e-02  3.74e-03   
Gender             -2.86e-03   4.72e-03  1.00e+00 -2.75e-02  1.47e-02   
Age                -3.96e-03   2.28e-02 -2.75e-02  1.00e+00 -1.00e-02   
Tenure              8.42e-04   3.74e-03  1.47e-02 -1.00e-02  1.00e+00   
Balance             6.27e-03   6.94e-02  1.21e-02  2.83e-02 -1.23e-02   
NumOfProducts       1.22e-02   3.97e-03 -2.19e-02 -3.07e-02  1.34e-02   
HasCrCard          -5.46e-03  -8.52e-03  5.77e-03 -1.17e-02  2.26e-02   
IsActiveMember      2.57e-02   6.72e-03  2.25e-02  8.55e-02 -2.84e-02   
EstimatedSalary    -1.38e-03  -1.37e-03 -8.11e-03 -7.20e-03  7.78e-03   

                  Balance  NumOfProducts  HasCrCard  IsActiveMember  \
CreditScore      6.27e-03       1.22e-02  -5.46e-03        2.57e-02   
Geography        6.94e-02       3.97e-03  -8.52e-03        6.72e-03   
Gender           1.21e-02      -2.19e-02   5.77e-03        2.25e-02   
Age              2.83e-02      -3.07e-02  -1.17e-02        8.55e-02   
Tenure          -1.23e-02       1.34e-02   2.26e-02       -2.84e-02   
Balance          1.00e+00      -3.04e-01  -1.49e-02       -1.01e-02   
NumOfProducts   -3.04e-01       1.00e+00   3.18e-03        9.61e-03   
HasCrCard       -1.49e-02       3.18e-03   1.00e+00       -1.19e-02   
IsActiveMember  -1.01e-02       9.61e-03  -1.19e-02        1.00e+00   
EstimatedSalary  1.28e-02       1.42e-02  -9.93e-03       -1.14e-02   

                 EstimatedSalary  
CreditScore            -1.38e-03  
Geography              -1.37e-03  
Gender                 -8.11e-03  
Age                    -7.20e-03  
Tenure                  7.78e-03  
Balance                 1.28e-02  
NumOfProducts           1.42e-02  
HasCrCard              -9.93e-03  
IsActiveMember         -1.14e-02  
EstimatedSalary         1.00e+00  

Ranking of Correlation Coefficients:
                                pairs      corr
28              (Age, IsActiveMember)  8.55e-02
12               (Geography, Balance)  6.94e-02
25                     (Age, Balance)  2.83e-02
7       (CreditScore, IsActiveMember)  2.57e-02
10                   (Geography, Age)  2.28e-02
32                (Tenure, HasCrCard)  2.26e-02
22           (Gender, IsActiveMember)  2.25e-02
18                   (Gender, Tenure)  1.47e-02
41   (NumOfProducts, EstimatedSalary)  1.42e-02
31            (Tenure, NumOfProducts)  1.34e-02
38         (Balance, EstimatedSalary)  1.28e-02
5        (CreditScore, NumOfProducts)  1.22e-02
19                  (Gender, Balance)  1.21e-02
40    (NumOfProducts, IsActiveMember)  9.61e-03
0            (CreditScore, Geography)  7.89e-03
34          (Tenure, EstimatedSalary)  7.78e-03
15        (Geography, IsActiveMember)  6.72e-03
4              (CreditScore, Balance)  6.27e-03
21                (Gender, HasCrCard)  5.77e-03
9                 (Geography, Gender)  4.72e-03
13         (Geography, NumOfProducts)  3.97e-03
11                (Geography, Tenure)  3.74e-03
39         (NumOfProducts, HasCrCard)  3.18e-03
3               (CreditScore, Tenure)  8.42e-04
16       (Geography, EstimatedSalary) -1.37e-03
8      (CreditScore, EstimatedSalary) -1.38e-03
1               (CreditScore, Gender) -2.86e-03
2                  (CreditScore, Age) -3.96e-03
6            (CreditScore, HasCrCard) -5.46e-03
29             (Age, EstimatedSalary) -7.20e-03
23          (Gender, EstimatedSalary) -8.11e-03
14             (Geography, HasCrCard) -8.52e-03
43       (HasCrCard, EstimatedSalary) -9.93e-03
24                      (Age, Tenure) -1.00e-02
37          (Balance, IsActiveMember) -1.01e-02
44  (IsActiveMember, EstimatedSalary) -1.14e-02
27                   (Age, HasCrCard) -1.17e-02
42        (HasCrCard, IsActiveMember) -1.19e-02
30                  (Tenure, Balance) -1.23e-02
36               (Balance, HasCrCard) -1.49e-02
20            (Gender, NumOfProducts) -2.19e-02
17                      (Gender, Age) -2.75e-02
33           (Tenure, IsActiveMember) -2.84e-02
26               (Age, NumOfProducts) -3.07e-02
35           (Balance, NumOfProducts) -3.04e-01


Highly correlated variables (Absolute Correlations):

Age             IsActiveMember     8.55e-02
Geography       Balance            6.94e-02
Age             Balance            2.83e-02
CreditScore     IsActiveMember     2.57e-02
Geography       Age                2.28e-02
Tenure          HasCrCard          2.26e-02
Gender          IsActiveMember     2.25e-02
                Tenure             1.47e-02
NumOfProducts   EstimatedSalary    1.42e-02
Tenure          NumOfProducts      1.34e-02
Balance         EstimatedSalary    1.28e-02
CreditScore     NumOfProducts      1.22e-02
Gender          Balance            1.21e-02
NumOfProducts   IsActiveMember     9.61e-03
CreditScore     Geography          7.89e-03
Tenure          EstimatedSalary    7.78e-03
Geography       IsActiveMember     6.72e-03
CreditScore     Balance            6.27e-03
Gender          HasCrCard          5.77e-03
Geography       Gender             4.72e-03
                NumOfProducts      3.97e-03
                Tenure             3.74e-03
NumOfProducts   HasCrCard          3.18e-03
CreditScore     Tenure             8.42e-04
Geography       EstimatedSalary   -1.37e-03
CreditScore     EstimatedSalary   -1.38e-03
                Gender            -2.86e-03
                Age               -3.96e-03
                HasCrCard         -5.46e-03
Age             EstimatedSalary   -7.20e-03
Gender          EstimatedSalary   -8.11e-03
Geography       HasCrCard         -8.52e-03
HasCrCard       EstimatedSalary   -9.93e-03
Age             Tenure            -1.00e-02
Balance         IsActiveMember    -1.01e-02
IsActiveMember  EstimatedSalary   -1.14e-02
Age             HasCrCard         -1.17e-02
HasCrCard       IsActiveMember    -1.19e-02
Tenure          Balance           -1.23e-02
Balance         HasCrCard         -1.49e-02
Gender          NumOfProducts     -2.19e-02
                Age               -2.75e-02
Tenure          IsActiveMember    -2.84e-02
Age             NumOfProducts     -3.07e-02
Balance         NumOfProducts     -3.04e-01
dtype: float64

count    10000.0
mean         0.2
std          0.4
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          1.0
Name: Exited, dtype: float64

Exited
0    7963
1    2037
dtype: int64

BOX plot of each numerical features

Histogram of each Numerical Feature

Correlation Matrix of All Numerical Features

Correlation plot of Numerical features
                 CreditScore  Geography    Gender       Age    Tenure  \
CreditScore         1.00e+00   7.89e-03 -2.86e-03 -3.96e-03  8.42e-04   
Geography           7.89e-03   1.00e+00  4.72e-03  2.28e-02  3.74e-03   
Gender             -2.86e-03   4.72e-03  1.00e+00 -2.75e-02  1.47e-02   
Age                -3.96e-03   2.28e-02 -2.75e-02  1.00e+00 -1.00e-02   
Tenure              8.42e-04   3.74e-03  1.47e-02 -1.00e-02  1.00e+00   
Balance             6.27e-03   6.94e-02  1.21e-02  2.83e-02 -1.23e-02   
NumOfProducts       1.22e-02   3.97e-03 -2.19e-02 -3.07e-02  1.34e-02   
HasCrCard          -5.46e-03  -8.52e-03  5.77e-03 -1.17e-02  2.26e-02   
IsActiveMember      2.57e-02   6.72e-03  2.25e-02  8.55e-02 -2.84e-02   
EstimatedSalary    -1.38e-03  -1.37e-03 -8.11e-03 -7.20e-03  7.78e-03   

                  Balance  NumOfProducts  HasCrCard  IsActiveMember  \
CreditScore      6.27e-03       1.22e-02  -5.46e-03        2.57e-02   
Geography        6.94e-02       3.97e-03  -8.52e-03        6.72e-03   
Gender           1.21e-02      -2.19e-02   5.77e-03        2.25e-02   
Age              2.83e-02      -3.07e-02  -1.17e-02        8.55e-02   
Tenure          -1.23e-02       1.34e-02   2.26e-02       -2.84e-02   
Balance          1.00e+00      -3.04e-01  -1.49e-02       -1.01e-02   
NumOfProducts   -3.04e-01       1.00e+00   3.18e-03        9.61e-03   
HasCrCard       -1.49e-02       3.18e-03   1.00e+00       -1.19e-02   
IsActiveMember  -1.01e-02       9.61e-03  -1.19e-02        1.00e+00   
EstimatedSalary  1.28e-02       1.42e-02  -9.93e-03       -1.14e-02   

                 EstimatedSalary  
CreditScore            -1.38e-03  
Geography              -1.37e-03  
Gender                 -8.11e-03  
Age                    -7.20e-03  
Tenure                  7.78e-03  
Balance                 1.28e-02  
NumOfProducts           1.42e-02  
HasCrCard              -9.93e-03  
IsActiveMember         -1.14e-02  
EstimatedSalary         1.00e+00

PIE Chart of for Target:

Fitting 10 folds for each of 960 candidates, totalling 9600 fits

[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 2880 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 7380 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 9600 out of 9600 | elapsed:   11.4s finished

========================================================
 Results from Grid Search 
========================================================

 The best estimator across ALL searched params:
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=78,
            splitter='best')

 The best score across ALL searched params:
 0.8551578947368421

 The best parameters across ALL searched params:
 {'criterion': 'gini', 'max_depth': 8, 'min_impurity_decrease': 0.0, 'min_weight_fraction_leaf': 0.0, 'random_state': 78, 'splitter': 'best'}

 ========================================================

[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished

Cross Validation results:  [0.85578947 0.86       0.85684211 0.85473684 0.84631579 0.85894737
 0.85263158 0.86631579 0.86       0.84      ]
CV Mean Accuracy: 0.855158 (Std: 0.007096)

========================================================

{'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': 78, 'splitter': 'best'}

========================================================