In [6]:
# Project 06 : Gradient Boosting with MCCV
In [7]:
# --------------------------------------------------------------------------------------------
# Data Science Competition Recipe - 006 
# Package: scikit-learn, Algorithm: Gradient Boosting Classifier, DataSet: OpenML bank Churn Dataset
# Tuning: Feature-subset tuning using RRSSV / MCCV (see the sketch in the next cell)
# --------------------------------------------------------------------------------------------
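In [ ]:
# A quick primer before the recipe: Monte Carlo Cross Validation (MCCV), also
# known as Repeated Random Sub-Sampling Validation (RRSSV), scores a model on
# many independent random train/test splits and averages the results. Below is
# a minimal, self-contained sketch on synthetic data (not the bank dataset),
# using scikit-learn's ShuffleSplit; the recipe that follows implements the
# same idea manually with repeated calls to train_test_split.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
mccv = ShuffleSplit(n_splits=12, test_size=0.2, random_state=0)  # 12 random splits
scores = cross_val_score(GradientBoostingClassifier(random_state=0), X, y,
                         cv=mccv, scoring='accuracy')
print('MCCV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))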
In [8]:
import warnings
warnings.filterwarnings("ignore")
In [9]:
def DSC_Recipe_6():
    print()
    print(format('Recipe for Data Science Competition - DSC_Recipe_6','*^65'))
    print(format('Classification with OpenML bank Churn dataset using scikit-learn gradient boosting & Monte Carlo Cross Validation', '*^95'))    
    print(format('Package: scikit-learn ','*^65'))            
    print(format('Model: Gradient Boosting Model','*^65'))            
    print(format('DataSet: OpenML bank Churn Dataset', '*^65'))    
    print(format('Model selection: using Monte Carlo Cross Validation (MCCV) / Repeated Random Sub-Sampling Validation (RRSSV)', '*^95'))    

    # load necessary libraries
    import time
    import pandas as pd
    import pickle as pk
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    import scikitplot as skplt
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.metrics import cohen_kappa_score, confusion_matrix
    from sklearn.preprocessing import LabelEncoder    
    start_time = time.time()
    
    # -------------------------------------------------------------------------
    # Helper modules for Descriptive Statistics
    # -------------------------------------------------------------------------    
    def get_redundant_pairs(df):
        # Collect the diagonal and lower-triangle pairs of the correlation matrix
        pairs_to_drop = set()
        cols = df.columns
        for i in range(0, df.shape[1]):
            for j in range(0, i+1):
                pairs_to_drop.add((cols[i], cols[j]))
        return pairs_to_drop

    def get_top_abs_correlations(df, n=5): 
        # Rank variable pairs by absolute correlation, strongest first
        au_corr = df.corr().abs().unstack()
        labels_to_drop = get_redundant_pairs(df)
        au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
        return au_corr[0:n]

    def corrank(X):
        # Rank every variable pair by its (signed) correlation coefficient
        import itertools
        df = pd.DataFrame([[(i,j), 
                   X.corr().loc[i,j]] for i,j in list(itertools.combinations(X.corr(), 2))],
                   columns=['pairs','corr'])
        print(df.sort_values(by='corr',ascending=False))
        print()

    # Helper module for Label Encoding for Categorical Features
    def dummyEncode(df):
        columnsToEncode = list(df.select_dtypes(include=['category',
                                                     'object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                df[feature] = le.fit_transform(df[feature])
            except Exception:
                print('Error encoding ' + feature)
        return df

    # -------------------------------------------------------------------------    
    # load dataset
    # ------------------------------------------------------------------------- 
    def load_dataset(filename):
        
        dataset = pd.read_csv(filename, sep = ',')
        
        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        feature_names = ['CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary']
        
        target = 'Exited'
        
        dataset = dummyEncode(dataset[['CreditScore', 'Geography',
                                       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
                                       'IsActiveMember', 'EstimatedSalary', 'Exited']])
        
        return feature_names, target, dataset
    
    # -------------------------------------------------------------------------    
    # find missing values in the dataset, if any exist
    # -------------------------------------------------------------------------
    def find_missing_value(feature_names, target, dataset):
        
        print()
        print('#---------------------------------------------------------------')
        print('Check for Missing Value or NaN Value in the Dataset')
        print('#---------------------------------------------------------------')
        # Method - 1
        # Count Number of Missing Values in Each Column    
        print('\nCount Number of Missing Values in Each Column: ')        
        print(dataset.isnull().sum(axis=0))
    
        # Count Number of Missing Values in Each Row    
        #print('\nCount Number of Missing Values in Each Row: ')        
        #print(dataset.isnull().sum(axis=1))

        # Method - 2
        # Check if there are any missing values in Dataset
        feature_count = dataset.columns[dataset.isnull().sum() != 0].size
        print()
        print("Total Features with missing Values = " + str(feature_count))

        if (feature_count):
            print()
            print("Features with NaN => {}".format(list(dataset.columns[dataset.isnull().sum() != 0])))
            print('Count Number of Missing Values in Each Column: ')        
            print(dataset[dataset.columns[dataset.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False))

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove constant columns in the Dataset')
        print('#---------------------------------------------------------------')
        colsToRemove = []
        for col in dataset.columns:
            if col not in ['Exited']:
                if dataset[col].std() == 0: 
                    colsToRemove.append(col)
        print()
        print("Removed `{}` Constant Columns: ".format(len(colsToRemove)))
        print(colsToRemove)
        # remove constant columns in the Dataset
        dataset.drop(colsToRemove, axis=1, inplace=True)

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove Duplicate Columns in the Dataset')
        print('#---------------------------------------------------------------')
        print()
        print(dataset.columns); print(dataset.head(5))
        print('\nDuplicate Columns in the Dataset: \n', dataset.columns.duplicated())        
        dataset = dataset.loc[:, ~dataset.columns.duplicated()]
        print()
        print(dataset.columns); print(dataset.head(5))
        
        print()
        print('#---------------------------------------------------------------')
        print('Check and Drop Sparse Data/Columns in the Dataset')
        print('#---------------------------------------------------------------')
        flist = [x for x in dataset.columns if x not in ['Exited']]
        print(); print(flist)
        for f in flist:
            if len(np.unique(dataset[f])) < 2:
                print('Feature contains Sparse Data: ', f)
                dataset.drop(f, axis=1, inplace=True)
        print()
        print(dataset.columns); print(dataset.head(5))
        
        # --------------------------------------------------
        # Missing Values treatment in the DataSet (if any)
        # --------------------------------------------------    
        # a) Filling NULL values with Zeros
        #dataset = dataset.fillna(0)
        #print('\nCount Number of Missing Values in Each Column: ')        
        ## Count Number of Missing Values in Each Column
        #print(dataset.isnull().sum(axis=0))
        #print('\nCount Number of Missing Values in Each Row: ')        
        ## Count Number of Missing Values in Each Row
        #print(dataset.isnull().sum(axis=1))

        # b) Filling NULL values according to their dataTypes
        # Group Dataset according to different dataTypes
        gd = dataset.columns.to_series().groupby(dataset.dtypes).groups
        print('\nGroup Columns according to their dataTypes: \n', gd)  
        colNames = dataset.columns.values.tolist()
        for colName in colNames:
            if dataset[colName].dtypes == 'int64':
                dataset[colName] = dataset[colName].fillna(0)
            if dataset[colName].dtypes == 'float64':
                dataset[colName] = dataset[colName].fillna(0.0) 
            if dataset[colName].dtypes == 'object':
                dataset[colName] = dataset[colName].fillna('Unknown')    

        ## Count Number of Missing Values in Each Column    
        print('\nCount Number of Missing Values in Each Column: ')        
        print(dataset.isnull().sum(axis=0))
        ## Count Number of Missing Values in Each Row    
        #print('\nCount Number of Missing Values in Each Row: ')        
        #print(dataset.isnull().sum(axis=1))

        # Check if there are any missing values in Dataset
        feature_count = dataset.columns[dataset.isnull().sum() != 0].size
        print()
        print("Total Features with missing Values = " + str(feature_count))
        
        return(dataset)
    
    # -------------------------------------------------------------------------
    # descriptive statistics and correlation matrix
    # -------------------------------------------------------------------------    
    def data_descriptiveStats(feature_names, target, dataset):
        # Count Number of Missing Values in Each Column    
        print(); print('Count Number of Missing Values in Each Column: ')        
        print(); print(dataset[feature_names].isnull().sum(axis=0))
        print(); print(dataset[target].isnull().sum(axis=0))    
    
        # Get Information on the feature variables
        print(); print('Get Information on the feature variables: ')            
        print(); print(dataset[feature_names].info())
        print(); print(dataset[feature_names].describe())
    
        # correlation
        pd.set_option('display.precision', 2)
        print(); print(dataset[feature_names].corr())    
    
        # Ranking of Correlation Coefficients among Variable Pairs
        print(); print("Ranking of Correlation Coefficients:")    
        corrank(dataset[feature_names])

        # Print Highly Correlated Variables
        print(); print("Highly correlated variables (Absolute Correlations):")
        print(); print(get_top_abs_correlations(dataset[feature_names], 8))
    
        # Get Information on the target    
        print(); print(dataset[target].describe())    
        print(); print(dataset.groupby(target).size())    
    
    # -------------------------------------------------------------------------
    # data visualisation and correlation graph
    # -------------------------------------------------------------------------
    def data_visualization(feature_names, target, dataset):
        # BOX plots USING box and whisker plots
        i = 1
        print(); print('BOX plot of each numerical feature')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(5,2,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=True, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].plot(kind='box')
            i += 1
        plt.show()    
    
        # USING histograms
        j = 1
        print(); print('Histogram of each Numerical Feature')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(5,2,j)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].hist()
            j += 1
        plt.show()

        # correlation matrix
        print(); print('Correlation Matrix of All Numerical Features')   
        fig = plt.figure(figsize=(11,9))
        ax = fig.add_subplot(111)
        cax = ax.matshow(dataset[feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
        fig.colorbar(cax)
        ticks = np.arange(0,10,1)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        plt.show()

        # Correlation Plot using seaborn
        print(); print("Correlation plot of Numerical features")
        # Compute the correlation matrix
        corr = dataset[feature_names].corr()
        print(corr)
        # Generate a mask for the upper triangle
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        # Set up the matplotlib figure
        f, ax = plt.subplots(figsize=(11, 9))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, vmin= -1.0, center=0, square=True, 
                    linewidths=.5, cbar_kws={"shrink": .5})
        plt.show()    
    
        # Pie chart for Categorical Variables
        print(); print('PIE Chart for Target: ')
        plt.figure(figsize=(11,9)) 
        i = 1
        for colName in [target]:
            labels = []; sizes = [];
            df = dataset.groupby(colName).size()
            for key in df.keys():
                labels.append(key)
                sizes.append(df[key])
            # Plot PIE Chart with %
            plt.subplot(2,2,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, 
                            labelleft=True, labeltop=True, labelright=False, labelbottom=False)        
            plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
            plt.axis('equal')
            i += 1; plt.savefig('Piefig.pdf', format='pdf')
        plt.show()    
    
    # -------------------------------------------------------------------------
    # data split to train and test datasets
    # -------------------------------------------------------------------------    
    def data_split(feature_names, target, dataset):
        # Data Transform - Split train : test datasets
        X_train, X_test, y_train, y_test = train_test_split(dataset.loc[:, feature_names], 
                                                            dataset.loc[:, target], test_size=0.05)
        return X_train, X_test, y_train, y_test

    def training_model(X_train, y_train):

        _value = []; _model = []; _best_features = [];

        # Create different Feature subsets
        F1 = ['CreditScore', 'Geography',
              'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
              'IsActiveMember', 'EstimatedSalary']
        
        F2 = [
              'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
              'IsActiveMember', 'EstimatedSalary']
        
        F3 = ['CreditScore', 'Geography',
              'IsActiveMember', 'EstimatedSalary']
        
        F4 = ['CreditScore', 'Geography',
              'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
              ]
        
        F5 = ['CreditScore', 'Geography',
              'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
              'IsActiveMember', 'EstimatedSalary']
        
        F6 = ['CreditScore', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
              'IsActiveMember', 'EstimatedSalary']
        
        F7 = ['CreditScore', 'Geography',
              'Gender', 'Age', 'Tenure', 'Balance',
              'IsActiveMember', 'EstimatedSalary']
        
        subsets_sum = [F1, F2, F3, F4, F5, F6, F7]
        
        print(subsets_sum)
        
        # Twelve random states, chosen arbitrarily, for the outer MCCV loop
        for i in [32,41,45,52,65,72,96,97,112,114,128,142]:
            print ('\n\nRandom state : ', i)
            
            model = GradientBoostingClassifier(random_state=i, n_estimators=200)

            # Split the dataset into two stratified parts, 80% for the outer training set
            X1_train, X1_test, y1_train, y1_test = train_test_split(X_train, y_train, 
                                                train_size=0.8, random_state=i, stratify=y_train)

            # Choose k-fold cross-validation technique for the inner loop
            inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

            # Set temporary variables
            best_subset = []
            best_score = -np.inf

            # Loop over the features combinations
            for subset in subsets_sum:
                score = cross_val_score(model, X=X1_train[subset], y=y1_train, 
                                        cv = inner_cv.split(X1_train[subset], y1_train), 
                                        scoring='accuracy')
                if score.mean() > best_score:
                    best_score = score.mean()
                    best_subset = subset

            # Train the model on the Outer training set with the selected feature combination
            model = model.fit(X1_train[best_subset], y1_train)
            
            # Calculate the predicted labels with the model on the Outer test set with the selected feature combination
            y1_pred = model.predict(X1_test[best_subset])
            
            # Calculate the accuracy between predicted and true labels
            acc = accuracy_score(y1_test, y1_pred)
            print('Selected features:', best_subset,'; Outer Test ACC: ',acc)
            
            _best_features.append(best_subset); _value.append(acc); _model.append(model); 
        
        #for i in range(0, len(_value)):
        #    print(); print(_best_features[i]); print('Accuracy: ',_value[i]); print(_model[i])
        
        print(); print(_value)
        print(); print('Maximum Accuracy Index: ', np.argmax(_value))
        
        idx = np.argmax(_value)
        print("\nBest model parameters with random_state:");    print(_model[idx])
        print("\nBest feature combination:");    print(_best_features[idx])
        print("\nBest accuracy from MCCV:");    print(_value[idx])
        
        return(_model[idx], _best_features[idx])

    def evaluate_model(model, features, X_train, y_train, X_test, y_test):
        
        print()
        print(model.get_params(deep=True))
        
        # Evaluate the skill of the Trained model
        pred_Class          = model.predict(X_test[features])
        acc                 = accuracy_score(y_test, pred_Class)
        classReport         = classification_report(y_test, pred_Class)
        confMatrix          = confusion_matrix(y_test, pred_Class) 
        kappa_score         = cohen_kappa_score(y_test, pred_Class)         
        
        print(); print('Evaluation of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)

        pred_proba = model.predict_proba(X_test[features])
        
        # Add more plots here using scikit-plot
        # ROC curves
        skplt.metrics.plot_roc(y_test,pred_proba,figsize=(9,9)); plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_test,pred_Class,figsize=(9,9)); plt.show()        

        # precision recall curve
        skplt.metrics.plot_precision_recall(y_test, pred_proba, 
                title='Precision-Recall Curve', plot_micro=True, 
                classes_to_plot=None, ax=None, figsize=(9,9), 
                cmap='nipy_spectral', title_fontsize='large', 
                text_fontsize='medium'); plt.show()
        
        # Add more ... ... ...
        
        # plot learning Curves
        skplt.estimators.plot_learning_curve(model, X_train[features], y_train, figsize=(6,6))
        plt.show()
        
        return model
    
    def featureRank_Analysis(model, dataset, cols):
        print()
        print("Feature Importance/Rank Analysis: ")
        X = dataset.loc[:, cols]; X_cols = X.columns.values
    
        features_imp = model.feature_importances_    
    
        indices = np.argsort(features_imp)[::-1]
        df = {}
        for f in range(X.shape[1]):
            print("%d. feature %d %s (%f)" % (f + 1, indices[f], X_cols[indices[f]], 
                                              features_imp[indices[f]]))
            df[f] = [f + 1, indices[f], X_cols[indices[f]], features_imp[indices[f]]]

        df1 = pd.DataFrame.from_dict(df, orient = 'index')
        df1.columns = ['feature_Rank', 'feature_Index', 'feature_Name', 'feature_importance']
        df1.to_csv("FeatureImportanceRank.csv", index = False)

        # this creates a figure 11 inches wide, 11 inches high
        plt.figure(figsize=(11,11)) 
        plt.barh(df1['feature_Rank'], df1['feature_importance'], tick_label = df1['feature_Name'])
        plt.savefig('Featurefig.pdf', format='pdf')
        plt.show()   

        skplt.estimators.plot_feature_importances(model, feature_names=cols,
                                                  x_tick_rotation = 90, figsize=(11,11))
        plt.show()

        # ------------------------------------------------
        # Visualise the tree-graph (GradientBoosting)
        # ------------------------------------------------
        # install graphViz and pydotplus using pip
        # install binaries from graphViz.org and 
        # add PATH variables
        # Follow the instruction @
        # https://stackoverflow.com/questions/18438997/
        # why-is-pydot-unable-to-find-graphvizs-executables-in-windows-8
        # ------------------------------------------------
        # Get an arbitrary tree number between (0,199) 
        # as "n_estimators = 200"

        '''
        sub_tree_number = 49 
        from sklearn import tree
        from io import StringIO  
        import pydotplus
    
        # Create a dot file
        dotfile = open("tree.dot", 'w')
        tree.export_graphviz(
                model.estimators_[sub_tree_number, 0], 
                #model.estimators_[sub_tree_number], 
                out_file = dotfile, feature_names = X_cols)
        dotfile.close()    

        # Create pdf and png from the dot data
        dot_data = StringIO()
        tree.export_graphviz(            
                model.estimators_[sub_tree_number, 0], 
                #model.estimators_[sub_tree_number], 
                out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,
                feature_names = X_cols)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
        graph.write_png("tree.png")
        graph.write_pdf("tree.pdf")
        '''
    
    def save_model(model):
        with open('DSC_Recipe_6_model.pickle', 'wb') as f: 
            pk.dump(model, f)

    def final_prediction(feature_names, filename):
        # load model
        with open('DSC_Recipe_6_model.pickle', 'rb') as f:
            model = pk.load(f)
        
        # load dataset
        dataset = pd.read_csv(filename, sep = ',')

        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        dataset = dummyEncode(dataset)
        
        # final prediction and results
        predicted_class     = model.predict(dataset[feature_names])
        pred_proba          = model.predict_proba(dataset[feature_names])        
        dataset['predicted_class'] = predicted_class

        # Evaluate the skill of the Trained model
        acc                 = accuracy_score(dataset['Exited'], predicted_class)
        classReport         = classification_report(dataset['Exited'], predicted_class)
        confMatrix          = confusion_matrix(dataset['Exited'], predicted_class) 
        kappa_score         = cohen_kappa_score(dataset['Exited'], predicted_class)         
        
        print(); print('Testing Results of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)
        
        # ROC curves
        skplt.metrics.plot_roc(dataset['Exited'],pred_proba,figsize=(7,7)); plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(dataset['Exited'],
                                            predicted_class,figsize=(7,7)); plt.show()        

        # precision recall curve
        skplt.metrics.plot_precision_recall(dataset['Exited'], pred_proba, 
                title='Precision-Recall Curve', plot_micro=True, 
                classes_to_plot=None, ax=None, figsize=(7,7), 
                cmap='nipy_spectral', title_fontsize='large', 
                text_fontsize='medium'); plt.show()               
        
        dataset.to_csv('FinalResult.csv', index = False, 
                       columns = ['Exited', 'predicted_class'])
    
    if __name__ == '__main__':
        print()
        print("Execution started at %s (epoch seconds)" % start_time)
        filename = 'Bank_Customer_Churn_Modelling_Dataset.csv'
        
        feature_names, target, dataset = load_dataset(filename)
        dataset = find_missing_value(feature_names, target, dataset)
        data_descriptiveStats(feature_names, target, dataset)
        data_visualization(feature_names, target, dataset)
        X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)
        model, features = training_model(X_train, y_train)
        model = evaluate_model(model, features, X_train, y_train, X_test, y_test)
        featureRank_Analysis(model, dataset, features)
        save_model(model) 
        
        test_filename = 'Bank_Customer_Churn_Modelling_Dataset.csv'
        final_prediction(features, test_filename)
        
        print()
        print("Execution Time %s seconds: " % (time.time() - start_time))
In [10]:
DSC_Recipe_6()
*******Recipe for Data Science Competition - DSC_Recipe_6********
Classification with OpenML bank Churn dataset using scikit-learn gradient boosting & Monte Carlo Cross Validation
*********************Package: scikit-learn **********************
*****************Model: Gradient Boosting Model******************
***************DataSet: OpenML bank Churn Dataset****************
Model selection: using Monte Carlo Cross Validation (MCCV) / Repeated Random Sub-Sampling Validation (RRSSV)

Execution started at 1604299875.2659845 (epoch seconds)
(10000, 14)
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         79084.10       0  
Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

#---------------------------------------------------------------
Check for Missing Value or NaN Value in the Dataset
#---------------------------------------------------------------

Count Number of Missing Values in Each Column: 
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Total Features with missing Values = 0

#---------------------------------------------------------------
Check and Remove constant columns in the Dataset
#---------------------------------------------------------------

Removed `0` Constant Columns: 
[]

#---------------------------------------------------------------
Check and Remove Duplicate Columns in the Dataset
#---------------------------------------------------------------

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')
   CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619          0       0   42       2       0.00              1   
1          608          2       0   41       1   83807.86              1   
2          502          0       0   42       8  159660.80              3   
3          699          0       0   39       1       0.00              2   
4          850          2       0   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          1               1        101348.88       1  
1          0               1        112542.58       0  
2          1               0        113931.57       1  
3          0               0         93826.63       0  
4          1               1         79084.10       0  

Duplicate Columns in the Dataset: 
 [False False False False False False False False False False False]

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')
   CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619          0       0   42       2       0.00              1   
1          608          2       0   41       1   83807.86              1   
2          502          0       0   42       8  159660.80              3   
3          699          0       0   39       1       0.00              2   
4          850          2       0   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          1               1        101348.88       1  
1          0               1        112542.58       0  
2          1               0        113931.57       1  
3          0               0         93826.63       0  
4          1               1         79084.10       0  

#---------------------------------------------------------------
Check and Drop Sparse Data/Columns in the Dataset
#---------------------------------------------------------------

['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')
   CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619          0       0   42       2       0.00              1   
1          608          2       0   41       1   83807.86              1   
2          502          0       0   42       8  159660.80              3   
3          699          0       0   39       1       0.00              2   
4          850          2       0   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          1               1        101348.88       1  
1          0               1        112542.58       0  
2          1               0        113931.57       1  
3          0               0         93826.63       0  
4          1               1         79084.10       0  

Group Columns according to their dataTypes: 
 {int64: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Exited'], float64: ['Balance', 'EstimatedSalary']}

Count Number of Missing Values in Each Column: 
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Total Features with missing Values = 0

Count Number of Missing Values in Each Column: 

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

0

Get Information on the feature variables: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  int64  
 2   Gender           10000 non-null  int64  
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
dtypes: float64(2), int64(8)
memory usage: 781.4 KB
None

       CreditScore  Geography    Gender       Age    Tenure    Balance  \
count     10000.00   10000.00  10000.00  10000.00  10000.00   10000.00   
mean        650.53       0.75      0.55     38.92      5.01   76485.89   
std          96.65       0.83      0.50     10.49      2.89   62397.41   
min         350.00       0.00      0.00     18.00      0.00       0.00   
25%         584.00       0.00      0.00     32.00      3.00       0.00   
50%         652.00       0.00      1.00     37.00      5.00   97198.54   
75%         718.00       1.00      1.00     44.00      7.00  127644.24   
max         850.00       2.00      1.00     92.00     10.00  250898.09   

       NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
count       10000.00   10000.00        10000.00         10000.00  
mean            1.53       0.71            0.52        100090.24  
std             0.58       0.46            0.50         57510.49  
min             1.00       0.00            0.00            11.58  
25%             1.00       0.00            0.00         51002.11  
50%             1.00       1.00            1.00        100193.91  
75%             2.00       1.00            1.00        149388.25  
max             4.00       1.00            1.00        199992.48  

                 CreditScore  Geography    Gender       Age    Tenure  \
CreditScore         1.00e+00   7.89e-03 -2.86e-03 -3.96e-03  8.42e-04   
Geography           7.89e-03   1.00e+00  4.72e-03  2.28e-02  3.74e-03   
Gender             -2.86e-03   4.72e-03  1.00e+00 -2.75e-02  1.47e-02   
Age                -3.96e-03   2.28e-02 -2.75e-02  1.00e+00 -1.00e-02   
Tenure              8.42e-04   3.74e-03  1.47e-02 -1.00e-02  1.00e+00   
Balance             6.27e-03   6.94e-02  1.21e-02  2.83e-02 -1.23e-02   
NumOfProducts       1.22e-02   3.97e-03 -2.19e-02 -3.07e-02  1.34e-02   
HasCrCard          -5.46e-03  -8.52e-03  5.77e-03 -1.17e-02  2.26e-02   
IsActiveMember      2.57e-02   6.72e-03  2.25e-02  8.55e-02 -2.84e-02   
EstimatedSalary    -1.38e-03  -1.37e-03 -8.11e-03 -7.20e-03  7.78e-03   

                  Balance  NumOfProducts  HasCrCard  IsActiveMember  \
CreditScore      6.27e-03       1.22e-02  -5.46e-03        2.57e-02   
Geography        6.94e-02       3.97e-03  -8.52e-03        6.72e-03   
Gender           1.21e-02      -2.19e-02   5.77e-03        2.25e-02   
Age              2.83e-02      -3.07e-02  -1.17e-02        8.55e-02   
Tenure          -1.23e-02       1.34e-02   2.26e-02       -2.84e-02   
Balance          1.00e+00      -3.04e-01  -1.49e-02       -1.01e-02   
NumOfProducts   -3.04e-01       1.00e+00   3.18e-03        9.61e-03   
HasCrCard       -1.49e-02       3.18e-03   1.00e+00       -1.19e-02   
IsActiveMember  -1.01e-02       9.61e-03  -1.19e-02        1.00e+00   
EstimatedSalary  1.28e-02       1.42e-02  -9.93e-03       -1.14e-02   

                 EstimatedSalary  
CreditScore            -1.38e-03  
Geography              -1.37e-03  
Gender                 -8.11e-03  
Age                    -7.20e-03  
Tenure                  7.78e-03  
Balance                 1.28e-02  
NumOfProducts           1.42e-02  
HasCrCard              -9.93e-03  
IsActiveMember         -1.14e-02  
EstimatedSalary         1.00e+00  

Ranking of Correlation Coefficients:
                                pairs      corr
28              (Age, IsActiveMember)  8.55e-02
12               (Geography, Balance)  6.94e-02
25                     (Age, Balance)  2.83e-02
7       (CreditScore, IsActiveMember)  2.57e-02
10                   (Geography, Age)  2.28e-02
32                (Tenure, HasCrCard)  2.26e-02
22           (Gender, IsActiveMember)  2.25e-02
18                   (Gender, Tenure)  1.47e-02
41   (NumOfProducts, EstimatedSalary)  1.42e-02
31            (Tenure, NumOfProducts)  1.34e-02
38         (Balance, EstimatedSalary)  1.28e-02
5        (CreditScore, NumOfProducts)  1.22e-02
19                  (Gender, Balance)  1.21e-02
40    (NumOfProducts, IsActiveMember)  9.61e-03
0            (CreditScore, Geography)  7.89e-03
34          (Tenure, EstimatedSalary)  7.78e-03
15        (Geography, IsActiveMember)  6.72e-03
4              (CreditScore, Balance)  6.27e-03
21                (Gender, HasCrCard)  5.77e-03
9                 (Geography, Gender)  4.72e-03
13         (Geography, NumOfProducts)  3.97e-03
11                (Geography, Tenure)  3.74e-03
39         (NumOfProducts, HasCrCard)  3.18e-03
3               (CreditScore, Tenure)  8.42e-04
16       (Geography, EstimatedSalary) -1.37e-03
8      (CreditScore, EstimatedSalary) -1.38e-03
1               (CreditScore, Gender) -2.86e-03
2                  (CreditScore, Age) -3.96e-03
6            (CreditScore, HasCrCard) -5.46e-03
29             (Age, EstimatedSalary) -7.20e-03
23          (Gender, EstimatedSalary) -8.11e-03
14             (Geography, HasCrCard) -8.52e-03
43       (HasCrCard, EstimatedSalary) -9.93e-03
24                      (Age, Tenure) -1.00e-02
37          (Balance, IsActiveMember) -1.01e-02
44  (IsActiveMember, EstimatedSalary) -1.14e-02
27                   (Age, HasCrCard) -1.17e-02
42        (HasCrCard, IsActiveMember) -1.19e-02
30                  (Tenure, Balance) -1.23e-02
36               (Balance, HasCrCard) -1.49e-02
20            (Gender, NumOfProducts) -2.19e-02
17                      (Gender, Age) -2.75e-02
33           (Tenure, IsActiveMember) -2.84e-02
26               (Age, NumOfProducts) -3.07e-02
35           (Balance, NumOfProducts) -3.04e-01


Highly correlated variables (Absolute Correlations):

Balance      NumOfProducts     0.30
Age          IsActiveMember    0.09
Geography    Balance           0.07
Age          NumOfProducts     0.03
Tenure       IsActiveMember    0.03
Age          Balance           0.03
Gender       Age               0.03
CreditScore  IsActiveMember    0.03
dtype: float64

count    10000.0
mean         0.2
std          0.4
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          1.0
Name: Exited, dtype: float64

Exited
0    7963
1    2037
dtype: int64

BOX plot of each numerical feature
Histogram of each Numerical Feature
Correlation Matrix of All Numerical Features
Correlation plot of Numerical features
                 CreditScore  Geography    Gender       Age    Tenure  \
CreditScore         1.00e+00   7.89e-03 -2.86e-03 -3.96e-03  8.42e-04   
Geography           7.89e-03   1.00e+00  4.72e-03  2.28e-02  3.74e-03   
Gender             -2.86e-03   4.72e-03  1.00e+00 -2.75e-02  1.47e-02   
Age                -3.96e-03   2.28e-02 -2.75e-02  1.00e+00 -1.00e-02   
Tenure              8.42e-04   3.74e-03  1.47e-02 -1.00e-02  1.00e+00   
Balance             6.27e-03   6.94e-02  1.21e-02  2.83e-02 -1.23e-02   
NumOfProducts       1.22e-02   3.97e-03 -2.19e-02 -3.07e-02  1.34e-02   
HasCrCard          -5.46e-03  -8.52e-03  5.77e-03 -1.17e-02  2.26e-02   
IsActiveMember      2.57e-02   6.72e-03  2.25e-02  8.55e-02 -2.84e-02   
EstimatedSalary    -1.38e-03  -1.37e-03 -8.11e-03 -7.20e-03  7.78e-03   

                  Balance  NumOfProducts  HasCrCard  IsActiveMember  \
CreditScore      6.27e-03       1.22e-02  -5.46e-03        2.57e-02   
Geography        6.94e-02       3.97e-03  -8.52e-03        6.72e-03   
Gender           1.21e-02      -2.19e-02   5.77e-03        2.25e-02   
Age              2.83e-02      -3.07e-02  -1.17e-02        8.55e-02   
Tenure          -1.23e-02       1.34e-02   2.26e-02       -2.84e-02   
Balance          1.00e+00      -3.04e-01  -1.49e-02       -1.01e-02   
NumOfProducts   -3.04e-01       1.00e+00   3.18e-03        9.61e-03   
HasCrCard       -1.49e-02       3.18e-03   1.00e+00       -1.19e-02   
IsActiveMember  -1.01e-02       9.61e-03  -1.19e-02        1.00e+00   
EstimatedSalary  1.28e-02       1.42e-02  -9.93e-03       -1.14e-02   

                 EstimatedSalary  
CreditScore            -1.38e-03  
Geography              -1.37e-03  
Gender                 -8.11e-03  
Age                    -7.20e-03  
Tenure                  7.78e-03  
Balance                 1.28e-02  
NumOfProducts           1.42e-02  
HasCrCard              -9.93e-03  
IsActiveMember         -1.14e-02  
EstimatedSalary         1.00e+00  
PIE Chart for Target: 
[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'], ['Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'], ['CreditScore', 'Geography', 'IsActiveMember', 'EstimatedSalary'], ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard'], ['CreditScore', 'Geography', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'], ['CreditScore', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'], ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'IsActiveMember', 'EstimatedSalary']]


Random state :  32
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8652631578947368


Random state :  41
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8663157894736843


Random state :  45
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8768421052631579


Random state :  52
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8621052631578947


Random state :  65
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8710526315789474


Random state :  72
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8642105263157894


Random state :  96
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8621052631578947


Random state :  97
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.858421052631579


Random state :  112
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8652631578947368


Random state :  114
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8557894736842105


Random state :  128
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8778947368421053


Random state :  142
Selected features: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] ; Outer Test ACC:  0.8636842105263158

[0.8652631578947368, 0.8663157894736843, 0.8768421052631579, 0.8621052631578947, 0.8710526315789474, 0.8642105263157894, 0.8621052631578947, 0.858421052631579, 0.8652631578947368, 0.8557894736842105, 0.8778947368421053, 0.8636842105263158]

Maximum Accuracy Index:  10

Best model parameters with random_state:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=128, subsample=1.0, verbose=0,
              warm_start=False)

Best feature combination:
['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

Best accuracy from MCCV:
0.8778947368421053

{'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'presort': 'auto', 'random_state': 128, 'subsample': 1.0, 'verbose': 0, 'warm_start': False}

Evaluation of the trained model: 

Accuracy :  0.862

Kappa Score :  0.45504517596512295

Confusion Matrix :
 [[392  17]
 [ 52  39]]

Classification Report :
              precision    recall  f1-score   support

          0       0.88      0.96      0.92       409
          1       0.70      0.43      0.53        91

avg / total       0.85      0.86      0.85       500

Feature Importance/Rank Analysis: 
1. feature 5 Balance (0.249322)
2. feature 9 EstimatedSalary (0.189977)
3. feature 3 Age (0.174565)
4. feature 0 CreditScore (0.118645)
5. feature 6 NumOfProducts (0.094331)
6. feature 1 Geography (0.084905)
7. feature 4 Tenure (0.035215)
8. feature 8 IsActiveMember (0.031170)
9. feature 2 Gender (0.018831)
10. feature 7 HasCrCard (0.003039)
(10000, 14)
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         79084.10       0  
Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

Testing Results of the trained model: 

Accuracy :  0.8788

Kappa Score :  0.5659904066691375

Confusion Matrix :
 [[7738  225]
 [ 987 1050]]

Classification Report :
              precision    recall  f1-score   support

          0       0.89      0.97      0.93      7963
          1       0.82      0.52      0.63      2037

avg / total       0.87      0.88      0.87     10000

Execution Time: 246.30173563957214 seconds
In [ ]:
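# Reusing the persisted model in a later session (a minimal sketch, assuming
# the recipe above has been run in the same directory so that the pickle and
# CSV files exist). Categorical columns must be label-encoded the same way as
# during training; here we re-fit a LabelEncoder, mirroring the recipe's
# dummyEncode helper.
import pickle as pk
import pandas as pd
from sklearn.preprocessing import LabelEncoder

with open('DSC_Recipe_6_model.pickle', 'rb') as f:
    model = pk.load(f)

new_data = pd.read_csv('Bank_Customer_Churn_Modelling_Dataset.csv')
le = LabelEncoder()
for col in new_data.select_dtypes(include=['object', 'category']).columns:
    new_data[col] = le.fit_transform(new_data[col])

feature_names = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure',
                 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
                 'EstimatedSalary']
print(model.predict_proba(new_data[feature_names])[:5])  # P(stay), P(churn)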