Project 08: XGBoost Algorithm with MCCV

--------------------------------------------------------------------------------------------

Data Science Competition Recipe - 008

Package: scikit-learn, Algorithm: Xgboost Classifier, DataSet: OpenML mobileset price Dataset

Tuning: Features tuning using RRSSV / MCCV

--------------------------------------------------------------------------------------------

In [4]:
import warnings
warnings.filterwarnings("ignore")
In [5]:
def DSC_Recipe_8():
    print()
    print(format('Recipe for Data Science Competition - DSC_Recipe_8','*^65'))
    print(format('Classification with OpenML mobileset price dataset using XGBoost & Monte Carlo Cross Validation', '*^95'))    
    print(format('Package: scikit-learn ','*^65'))            
    print(format('Model: XGBoost Model','*^65'))            
    print(format('DataSet: OpenML mobileset price Dataset', '*^65'))    
    print(format('Model selection: using Monte Carlo Cross Validation (MCCV) / Repeated Random Sub-Sampling Validation (RRSSV)', '*^95'))    

    # load necessary libraries
    import time
    import pandas as pd
    import pickle as pk
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    import scikitplot as skplt
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import train_test_split
    import xgboost
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.metrics import cohen_kappa_score, confusion_matrix
    from sklearn.preprocessing import LabelEncoder    
    import warnings
    
    start_time = time.time()
    
    # -------------------------------------------------------------------------
    # Helper modules for Descriptive Statistics
    # -------------------------------------------------------------------------    
    def get_redundant_pairs(df):
        pairs_to_drop = set()
        cols = df.columns
        for i in range(0, df.shape[1]):
            for j in range(0, i+1):
                pairs_to_drop.add((cols[i], cols[j]))
        return pairs_to_drop

    def get_top_abs_correlations(df, n=5): 
        #au_corr = df.corr().abs().unstack()
        au_corr = df.corr().unstack()
        labels_to_drop = get_redundant_pairs(df)
        au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
        return au_corr[0:n]

    def corrank(X):
        import itertools
        df = pd.DataFrame([[(i,j), 
                   X.corr().loc[i,j]] for i,j in list(itertools.combinations(X.corr(), 2))],
                   columns=['pairs','corr'])
        print(df.sort_values(by='corr',ascending=False))
        print()

    # Helper module for Label Encoding for Categorical Features
    def dummyEncode(df):
        columnsToEncode = list(df.select_dtypes(include=['category',
                                                     'object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                df[feature] = le.fit_transform(df[feature])
            except:
                print('Error encoding '+feature)
        return df

    # -------------------------------------------------------------------------    
    # load dataset
    # ------------------------------------------------------------------------- 
    def load_dataset(filename):
        
        dataset = pd.read_csv(filename, sep = ',')
        
        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        feature_names = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
                         'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
                         'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
                         'touch_screen', 'wifi']
        
        target = 'price_range'
        
        dataset = dummyEncode(dataset[['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
                                       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
                                       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
                                       'touch_screen', 'wifi', 'price_range']])
        
        return feature_names, target, dataset

    # -------------------------------------------------------------------------    
    # find missing values in dataset if exists
    # -------------------------------------------------------------------------
    def find_miising_value(feature_names, target, dataset):
        
        print()
        print('#---------------------------------------------------------------')
        print('Check for Mising Value or NaN Value in the Dataset')
        print('#---------------------------------------------------------------')
        # Method - 1
        # Count Number of Missing Value on Each Column    
        print('\nCount Number of Missing Value on Each Column: ')        
        print(dataset.isnull().sum(axis=0))
    
        # Count Number of Missing Value on Each Row    
        #print('\nCount Number of Missing Value on Each Row: ')        
        #print(dataset.isnull().sum(axis=1))

        # Method - 2
        # Check if there are any missing values in Dataset
        feature_count = dataset.columns[dataset.isnull().sum() != 0].size
        print()
        print("Total Features with missing Values = " + str(feature_count))

        if (feature_count):
            print()
            print("Features with NaN => {}".format(list(dataset.columns[dataset.isnull().sum() != 0])))
            print('Count Number of Missing Value on Each Column: ')        
            print(dataset[dataset.columns[dataset.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False))

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove constant columns in the Dataset')
        print('#---------------------------------------------------------------')
        colsToRemove = []
        for col in dataset.columns:
            if col not in ['price_range']:
                if dataset[col].std() == 0: 
                    colsToRemove.append(col)
        print()
        print("Removed `{}` Constant Columns: ".format(len(colsToRemove)))
        print(colsToRemove)
        # remove constant columns in the Dataset
        dataset.drop(colsToRemove, axis=1, inplace=True)

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove Duplicate Columns in the Dataset')
        print('#---------------------------------------------------------------')
        print()
        print(dataset.columns); print(dataset.head(5))
        print('\nDuplicate Columns in the Dataset: \n', dataset.columns.duplicated())        
        dataset = dataset.loc[:, ~dataset.columns.duplicated()]
        print()
        print(dataset.columns); print(dataset.head(5))
        
        print()
        print('#---------------------------------------------------------------')
        print('Check and Drop Sparse Data/Columns in the Dataset')
        print('#---------------------------------------------------------------')
        flist = [x for x in dataset.columns if not x in ['price_range']]
        print(); print(flist)
        for f in flist:
            if len(np.unique(dataset[f])) < 2:
                print('Feature contains Sparse Data: ', f)
                dataset.drop(f, axis=1, inplace=True)
        print()
        print(dataset.columns); print(dataset.head(5))
        
        # --------------------------------------------------
        # Missing Values treatment in the DataSet (if any)
        # --------------------------------------------------    
        # a) Filling NULL values with Zeros
        #dataset = dataset.fillna(0)
        #print('\nCount Number of Missing Value on Each Column: ')        
        ## Count Number of Missing Value on Each Column
        #print(dataset.isnull().sum(axis=0))
        #print('\nCount Number of Missing Value on Each Row: ')        
        ## Count Number of Missing Value on Each Row
        #print(dataset.isnull().sum(axis=1))

        # b) Filling NULL values according to their dataTypes
        # Group Dataset according to different dataTypes
        gd = dataset.columns.to_series().groupby(dataset.dtypes).groups
        print('\nGroup Columns according to their dataTypes: \n', gd)  
        colNames = dataset.columns.values.tolist()
        for colName in colNames:
            if dataset[colName].dtypes == 'int64':
                dataset[colName] = dataset[colName].fillna(0)
            if dataset[colName].dtypes == 'float64':
                dataset[colName] = dataset[colName].fillna(0.0) 
            if dataset[colName].dtypes == 'object':
                dataset[colName] = dataset[colName].fillna('Unknown')    

        ## Count Number of Missing Value on Each Column    
        print('\nCount Number of Missing Value on Each Column: ')        
        print(dataset.isnull().sum(axis=0))
        ## Count Number of Missing Value on Each Row    
        #print('\nCount Number of Missing Value on Each Row: ')        
        #print(dataset.isnull().sum(axis=1))

        # Check if there are any missing values in Dataset
        feature_count = dataset.columns[dataset.isnull().sum() != 0].size
        print()
        print("Total Features with missing Values = " + str(feature_count))
        
        return(dataset)

    # -------------------------------------------------------------------------
    # descriptive statistics and correlation matrix
    # -------------------------------------------------------------------------    
    def data_descriptiveStats(feature_names, target, dataset):
        # Count Number of Missing Value on Each Column    
        print(); print('Count Number of Missing Value on Each Column: ')        
        print(); print(dataset[feature_names].isnull().sum(axis=0))
        print(); print(dataset[target].isnull().sum(axis=0))    
    
        # Get Information on the feature variables
        print(); print('Get Information on the feature variables: ')            
        print(); print(dataset[feature_names].info())
        print(); print(dataset[feature_names].describe())
    
        # correlation
        pd.set_option('precision', 2)
        print(); print(dataset[feature_names].corr())    
    
        # Ranking of Correlation Coefficients among Variable Pairs
        print(); print("Ranking of Correlation Coefficients:")    
        corrank(dataset[feature_names])

        # Print Highly Correlated Variables
        print(); print("Highly correlated variables (Absolute Correlations):")
        print(); print(get_top_abs_correlations(dataset[feature_names], 8))
    
        # Get Information on the target    
        print(); print(dataset[target].describe())    
        print(); print(dataset.groupby(target).size())    
    
    # -------------------------------------------------------------------------
    # data visualisation and correlation graph
    # -------------------------------------------------------------------------
    def data_visualization(feature_names, target, dataset):
        # BOX plots USING box and whisker plots
        i = 1
        print(); print('BOX plot of each numerical features')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(5,4,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=True, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
            i += 1
        plt.show()    
    
        # USING histograms
        j = 1
        print(); print('Histogram of each Numerical Feature')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(5,4,j)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].hist()
            j += 1
        plt.show()

        # correlation matrix
        print(); print('Correlation Matrix of All Numerical Features')   
        fig = plt.figure(figsize=(11,9))
        ax = fig.add_subplot(111)
        cax = ax.matshow(dataset[feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
        fig.colorbar(cax)
        ticks = np.arange(0,20,1)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        plt.show()

        # Correlation Plot using seaborn
        print(); print("Correlation plot of Numerical features")
        # Compute the correlation matrix
        corr = dataset[feature_names].corr()
        print(corr)
        # Generate a mask for the upper triangle
        mask = np.zeros_like(corr, dtype=np.bool)
        mask[np.triu_indices_from(mask)] = True
        # Set up the matplotlib figure
        f, ax = plt.subplots(figsize=(11, 9))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, vmin= -1.0, center=0, square=True, 
                    linewidths=.5, cbar_kws={"shrink": .5})
        plt.show()    
    
        # Pie chart for Categorical Variables
        print(); print('PIE Chart of for Target: ')
        plt.figure(figsize=(11,9)) 
        i = 1
        for colName in [target]:
            labels = []; sizes = [];
            df = dataset.groupby(colName).size()
            for key in df.keys():
                labels.append(key)
                sizes.append(df[key])
            # Plot PIE Chart with %
            plt.subplot(2,2,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, 
                            labelleft=True, labeltop=True, labelright=False, labelbottom=False)        
            plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
            plt.axis('equal')
            i += 1; plt.savefig('Piefig.pdf', format='pdf')
        plt.show()    
    
    # -------------------------------------------------------------------------
    # data split to train and test datasets
    # -------------------------------------------------------------------------    
    def data_split(feature_names, target, dataset):
        # Data Transform - Split train : test datasets
        X_train, X_test, y_train, y_test = train_test_split(dataset.loc[:, feature_names], 
                                                            dataset.loc[:, target], test_size=0.20)
        return X_train, X_test, y_train, y_test

    def training_model(X_train, y_train):

        _value = []; _model = []; _best_features = [];

        # Create different Feature subsets
        F1 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        F2 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        F3 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
              
              'touch_screen', 'wifi']
        
        F4 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'
              ]
        
        F5 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'm_dep', 
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        F6 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'n_cores', 'pc', 'px_height',
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        F7 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'n_cores', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        subsets_sum = [F1] + [F2] + [F3] + [F4] + [F5] + [F6] + [F7]
        
        print(subsets_sum)
        
        # Twelve random sates randomly choosen for the outer-MCCV
        for i in [32,41,45,52,65,72,96,97,112,114,128,142]:
            
            warnings.filterwarnings("ignore")
            
            print ('\n\nRandom state : ', i)
            
            model = xgboost.XGBClassifier(objective = 'binary:logistic', n_estimators=200)

            #  Split the dataset into two stratified parts, 80% for Outer training set
            X1_train, X1_test, y1_train, y1_test = train_test_split(X_train, y_train, 
                                                train_size=0.8, random_state=i, stratify=y_train)

            # Choose k-fold cross-validation technique for the inner loop
            inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

            # Set temporary variables
            best_subset = []
            best_auc = -np.inf

            # Loop over the features combinations
            for subset in subsets_sum:
                score = cross_val_score(model, X=X1_train[subset], y=y1_train, 
                                        cv = inner_cv.split(X1_train[subset], y1_train), 
                                        scoring='accuracy')
                if score.mean() > best_auc:
                    best_auc = score.mean()
                    best_subset = subset

            # Train the model on the Outer training set with the selected feature combination
            model = model.fit(X1_train[best_subset], y1_train)
            
            # Calculate the predicted labels with the model on the Outer test set with the selected feature combination
            y1_pred = model.predict(X1_test[best_subset])
            
            # Calculate the accuracy between predicted and true labels
            acc = accuracy_score(y1_test, y1_pred)
            print('Selected features:', best_subset,'; Outer Test ACC: ',acc)
            
            _best_features.append(best_subset); _value.append(acc); _model.append(model); 
        
        #for i in range(0, len(_value)):
        #    print(); print(_best_features[i]); print('Accuracy: ',_value[i]); print(_model[i])
        
        print(); print(_value)
        print(); print('Maximum Accuracy Index: ', np.argmax(_value))
        
        idx = np.argmax(_value)
        print("\nBest model parameters with random_state:");    print(_model[idx])
        print("\nBest feature combination:");    print(_best_features[idx])
        print("\nBest accuracy from MCCV:");    print(_value[idx])
        
        return(_model[idx], _best_features[idx])

    def evaluate_model(model, features, X_train, y_train, X_test, y_test):
        
        print()
        print(model.get_params(deep=True))
        
        # Evaluate the skill of the Trained model
        pred_Class          = model.predict(X_test[features])
        acc                 = accuracy_score(y_test, pred_Class)
        classReport         = classification_report(y_test, pred_Class)
        confMatrix          = confusion_matrix(y_test, pred_Class) 
        kappa_score         = cohen_kappa_score(y_test, pred_Class)         
        
        print(); print('Evaluation of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)

        pred_proba = model.predict_proba(X_test[features])
        
        # Add more plots here using scikit-plot
        # ROC curves
        skplt.metrics.plot_roc(y_test,pred_proba,figsize=(8,6)); plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_test,pred_Class,figsize=(6,6)); plt.show()        

        # precision recall curve
        skplt.metrics.plot_precision_recall(y_test, pred_proba, 
                title='Precision-Recall Curve', plot_micro=True, 
                classes_to_plot=None, ax=None, figsize=(9,6), 
                cmap='nipy_spectral', title_fontsize='large', 
                text_fontsize='medium'); plt.show()
        
        # Add more ... ... ...
        
        # plot learning Curves
        #skplt.estimators.plot_learning_curve(model, X_train[features], y_train, figsize=(6,6))
        #plt.show()
        
        return model
    
    def featureRank_Analysis(model, dataset, cols):
        print()
        print("Feature Importance/Rank Analysis: ")
        X = dataset.loc[:, cols]; X_cols = X.columns.values
    
        features_imp = model.feature_importances_    
    
        indices = np.argsort(features_imp)[::-1]
        df = {}
        for f in range(X.shape[1]):
            print("%d. feature %d %s (%f)" % (f + 1, indices[f], X_cols[indices[f]], 
                                              features_imp[indices[f]]))
            df[f] = [f + 1, indices[f], X_cols[indices[f]], features_imp[indices[f]]]

        df1 = pd.DataFrame.from_dict(df, orient = 'index')
        df1.columns = ['feature_Rank', 'feature_Index', 'feature_Name', 'feature_importance']
        df1.to_csv("FeatureImportanceRank.csv", index = False)

        # this creates a figure 5 inch wide, 3 inch high
        plt.figure(figsize=(8,8)) 
        plt.barh(df1['feature_Rank'], df1['feature_importance'], tick_label = df1['feature_Name'])
        plt.savefig('Featurefig.pdf', format='pdf')
        plt.show()   

        skplt.estimators.plot_feature_importances(model, feature_names=cols,
                                                  x_tick_rotation = 90, figsize=(8,8))
        plt.show()

        # ------------------------------------------------
        # Visualise the tree-graph (GradientBoosting)
        # ------------------------------------------------
        # install graphViz and pydotplus using pip
        # install binaries from graphViz.org and 
        # add PATH variables
        # Follow the instruction @
        # https://stackoverflow.com/questions/18438997/
        # why-is-pydot-unable-to-find-graphvizs-executables-in-windows-8
        # ------------------------------------------------
        # Get an arbitary tree number between (0,99) 
        # as "n_estimators = 100"
        
        sub_tree_number = 27
        # plot tree from Left to Right
        xgboost.plot_tree(model, num_trees=sub_tree_number, rankdir='LR')
        fig = plt.gcf(); fig.set_size_inches(9, 9); plt.show()
        fig.savefig('treeOpenML mobileset price-1.png')
        # plot tree top to bottom
        xgboost.plot_tree(model, num_trees=sub_tree_number)
        fig = plt.gcf(); fig.set_size_inches(9, 9); plt.show()
        fig.savefig('treeOpenML mobileset price-2.png')
        
        # plot feature importance
        xgboost.plot_importance(model); plt.show()
    
    def save_model(model):
        with open('DSC_Recipe_8_model.pickle', 'wb') as f: 
            pk.dump(model, f)

    def final_prediction(feature_names, filename):
        # load model
        f = open('DSC_Recipe_8_model.pickle', 'rb')
        model = pk.load(f); f.close();
        
        # load dataset
        dataset = pd.read_csv(filename, sep = ',')

        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        dataset = dummyEncode(dataset)
        
        # final prediction and results
        predicted_class     = model.predict(dataset[feature_names])
        pred_proba          = model.predict_proba(dataset[feature_names])        
        dataset['predicted_class'] = predicted_class

        # Evaluate the skill of the Trained model
        acc                 = accuracy_score(dataset['price_range'], predicted_class)
        classReport         = classification_report(dataset['price_range'], predicted_class)
        confMatrix          = confusion_matrix(dataset['price_range'], predicted_class) 
        kappa_score         = cohen_kappa_score(dataset['price_range'], predicted_class)         
        
        print(); print('Testing Results of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)
        
        # ROC curves
        skplt.metrics.plot_roc(dataset['price_range'],pred_proba,figsize=(7,7)); plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(dataset['price_range'],
                                            predicted_class,figsize=(7,7)); plt.show()        

        # precision recall curve
        skplt.metrics.plot_precision_recall(dataset['price_range'], pred_proba, 
                title='Precision-Recall Curve', plot_micro=True, 
                classes_to_plot=None, ax=None, figsize=(7,7), 
                cmap='nipy_spectral', title_fontsize='large', 
                text_fontsize='medium'); plt.show()               
        
        dataset.to_csv('FinalResult.csv', index = False, 
                       columns = ['price_range', 'predicted_class'])

    def final_prediction_with_testDataset(feature_names, filename):
        # load model
        f = open('DSC_Recipe_8_model.pickle', 'rb')
        model = pk.load(f); f.close();
        
        # load dataset
        dataset = pd.read_csv(filename, sep = ',')

        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        dataset = dummyEncode(dataset)
        
        # final prediction and results
        predicted_class     = model.predict(dataset[feature_names])
        pred_proba          = model.predict_proba(dataset[feature_names])        

        dataset['predicted_class'] = predicted_class
        dataset['predicted_proba'] = pred_proba.tolist()
        
        dataset.to_csv('FinalResultWith_testDataset.csv', index = False)
    
    if __name__ == '__main__':
        print()
        print("Execution Time %s seconds: " % (start_time))
        filename = 'mobilePriceClassification_trainDataset.csv'
        
        feature_names, target, dataset = load_dataset(filename)
        dataset = find_miising_value(feature_names, target, dataset)
        data_descriptiveStats(feature_names, target, dataset)
        data_visualization(feature_names, target, dataset)
        X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)
        model, features = training_model(X_train, y_train)
        model = evaluate_model(model, features, X_train, y_train, X_test, y_test)
        featureRank_Analysis(model, dataset, features)
        save_model(model) 
        
        test_filename = 'mobilePriceClassification_trainDataset.csv'
        final_prediction(features, test_filename)
        
        test_filename = 'mobilePriceClassification_testDataset.csv'
        final_prediction_with_testDataset(features, test_filename)
        
        print()
        print("Execution Time %s seconds: " % (time.time() - start_time))   
In [6]:
DSC_Recipe_8()
*******Recipe for Data Science Competition - DSC_Recipe_8********
Classification with OpenML mobileset price dataset using XGBoost & Monte Carlo Cross Validation
*********************Package: scikit-learn **********************
**********************Model: XGBoost Model***********************
*************DataSet: OpenML mobileset price Dataset*************
Model selection: using Monte Carlo Cross Validation (MCCV) / Repeated Random Sub-Sampling Validation (RRSSV)

Execution Time 1614061750.594958 seconds: 
(2000, 21)
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

#---------------------------------------------------------------
Check for Mising Value or NaN Value in the Dataset
#---------------------------------------------------------------

Count Number of Missing Value on Each Column: 
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

#---------------------------------------------------------------
Check and Remove constant columns in the Dataset
#---------------------------------------------------------------

Removed `0` Constant Columns: 
[]

#---------------------------------------------------------------
Check and Remove Duplicate Columns in the Dataset
#---------------------------------------------------------------

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

Duplicate Columns in the Dataset: 
 [False False False False False False False False False False False False
 False False False False False False False False False]

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

#---------------------------------------------------------------
Check and Drop Sparse Data/Columns in the Dataset
#---------------------------------------------------------------

['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

Group Columns according to their dataTypes: 
 {int64: ['battery_power', 'blue', 'dual_sim', 'fc', 'four_g', 'int_memory', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range'], float64: ['clock_speed', 'm_dep']}

Count Number of Missing Value on Each Column: 
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

Count Number of Missing Value on Each Column: 

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
dtype: int64

0

Get Information on the feature variables: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_screen   2000 non-null   int64  
 19  wifi           2000 non-null   int64  
dtypes: float64(2), int64(18)
memory usage: 312.6 KB
None

       battery_power     blue  clock_speed  dual_sim       fc   four_g  \
count        2000.00  2000.00      2000.00   2000.00  2000.00  2000.00   
mean         1238.52     0.49         1.52      0.51     4.31     0.52   
std           439.42     0.50         0.82      0.50     4.34     0.50   
min           501.00     0.00         0.50      0.00     0.00     0.00   
25%           851.75     0.00         0.70      0.00     1.00     0.00   
50%          1226.00     0.00         1.50      1.00     3.00     1.00   
75%          1615.25     1.00         2.20      1.00     7.00     1.00   
max          1998.00     1.00         3.00      1.00    19.00     1.00   

       int_memory    m_dep  mobile_wt  n_cores       pc  px_height  px_width  \
count     2000.00  2000.00    2000.00  2000.00  2000.00    2000.00   2000.00   
mean        32.05     0.50     140.25     4.52     9.92     645.11   1251.52   
std         18.15     0.29      35.40     2.29     6.06     443.78    432.20   
min          2.00     0.10      80.00     1.00     0.00       0.00    500.00   
25%         16.00     0.20     109.00     3.00     5.00     282.75    874.75   
50%         32.00     0.50     141.00     4.00    10.00     564.00   1247.00   
75%         48.00     0.80     170.00     7.00    15.00     947.25   1633.00   
max         64.00     1.00     200.00     8.00    20.00    1960.00   1998.00   

           ram     sc_h     sc_w  talk_time  three_g  touch_screen     wifi  
count  2000.00  2000.00  2000.00    2000.00  2000.00        2000.0  2000.00  
mean   2124.21    12.31     5.77      11.01     0.76           0.5     0.51  
std    1084.73     4.21     4.36       5.46     0.43           0.5     0.50  
min     256.00     5.00     0.00       2.00     0.00           0.0     0.00  
25%    1207.50     9.00     2.00       6.00     1.00           0.0     0.00  
50%    2146.50    12.00     5.00      11.00     1.00           1.0     1.00  
75%    3064.50    16.00     9.00      16.00     1.00           1.0     1.00  
max    3998.00    19.00    18.00      20.00     1.00           1.0     1.00  

               battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02   
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03   
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04   
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02   
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00   
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02   
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02   
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03   
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02   
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02   
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01   
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03   
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03   
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02   
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02   
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02   
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03   
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03   
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02   
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02   

                 four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02   
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03   
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03   
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02   
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01   
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03   
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02   
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02   
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02   
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03   
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00   
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02   
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03   
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02   
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03   
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02   
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02   
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03   
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03   
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03   

               px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02   
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02   
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02   
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02   
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03   
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02   
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03   
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02   
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03   
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02   
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02   
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02   
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03   
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02   
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02   
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02   
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00   
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02   
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02   
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02   

                three_g  touch_screen      wifi  
battery_power  1.15e-02     -1.05e-02 -8.34e-03  
blue          -3.02e-02      1.01e-02 -2.19e-02  
clock_speed   -4.64e-02      1.98e-02 -2.45e-02  
dual_sim      -1.40e-02     -1.71e-02  2.27e-02  
fc             1.79e-03     -1.48e-02  2.01e-02  
four_g         5.84e-01      1.68e-02 -1.76e-02  
int_memory    -9.37e-03     -2.70e-02  6.99e-03  
m_dep         -1.21e-02     -2.64e-03 -2.84e-02  
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04  
n_cores       -1.47e-02      2.38e-02 -9.96e-03  
pc            -1.32e-03     -8.74e-03  5.39e-03  
px_height     -3.12e-02      2.19e-02  5.18e-02  
px_width       3.50e-04     -1.63e-03  3.03e-02  
ram            1.58e-02     -3.05e-02  2.27e-02  
sc_h           1.20e-02     -2.00e-02  2.59e-02  
sc_w           3.09e-02      1.27e-02  3.54e-02  
talk_time     -4.27e-02      1.72e-02 -2.95e-02  
three_g        1.00e+00      1.39e-02  4.32e-03  
touch_screen   1.39e-02      1.00e+00  1.19e-02  
wifi           4.32e-03      1.19e-02  1.00e+00  

Ranking of Correlation Coefficients:
                         pairs  corr
75                    (fc, pc)  0.64
96           (four_g, three_g)  0.58
154      (px_height, px_width)  0.51
175               (sc_h, sc_w)  0.51
156          (px_height, sc_h)  0.06
..                         ...   ...
2    (battery_power, dual_sim) -0.04
184       (talk_time, three_g) -0.04
39       (clock_speed, four_g) -0.04
51      (clock_speed, three_g) -0.05
95         (four_g, talk_time) -0.05

[190 rows x 2 columns]


Highly correlated variables (Absolute Correlations):

fc             pc           0.64
four_g         three_g      0.58
px_height      px_width     0.51
sc_h           sc_w         0.51
px_height      sc_h         0.06
battery_power  talk_time    0.05
px_height      wifi         0.05
               sc_w         0.04
dtype: float64

count    2000.00
mean        1.50
std         1.12
min         0.00
25%         0.75
50%         1.50
75%         2.25
max         3.00
Name: price_range, dtype: float64

price_range
0    500
1    500
2    500
3    500
dtype: int64

BOX plot of each numerical features
Histogram of each Numerical Feature
Correlation Matrix of All Numerical Features
Correlation plot of Numerical features
               battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02   
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03   
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04   
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02   
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00   
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02   
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02   
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03   
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02   
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02   
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01   
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03   
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03   
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02   
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02   
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02   
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03   
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03   
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02   
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02   

                 four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02   
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03   
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03   
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02   
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01   
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03   
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02   
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02   
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02   
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03   
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00   
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02   
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03   
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02   
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03   
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02   
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02   
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03   
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03   
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03   

               px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02   
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02   
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02   
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02   
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03   
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02   
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03   
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02   
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03   
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02   
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02   
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02   
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03   
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02   
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02   
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02   
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00   
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02   
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02   
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02   

                three_g  touch_screen      wifi  
battery_power  1.15e-02     -1.05e-02 -8.34e-03  
blue          -3.02e-02      1.01e-02 -2.19e-02  
clock_speed   -4.64e-02      1.98e-02 -2.45e-02  
dual_sim      -1.40e-02     -1.71e-02  2.27e-02  
fc             1.79e-03     -1.48e-02  2.01e-02  
four_g         5.84e-01      1.68e-02 -1.76e-02  
int_memory    -9.37e-03     -2.70e-02  6.99e-03  
m_dep         -1.21e-02     -2.64e-03 -2.84e-02  
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04  
n_cores       -1.47e-02      2.38e-02 -9.96e-03  
pc            -1.32e-03     -8.74e-03  5.39e-03  
px_height     -3.12e-02      2.19e-02  5.18e-02  
px_width       3.50e-04     -1.63e-03  3.03e-02  
ram            1.58e-02     -3.05e-02  2.27e-02  
sc_h           1.20e-02     -2.00e-02  2.59e-02  
sc_w           3.09e-02      1.27e-02  3.54e-02  
talk_time     -4.27e-02      1.72e-02 -2.95e-02  
three_g        1.00e+00      1.39e-02  4.32e-03  
touch_screen   1.39e-02      1.00e+00  1.19e-02  
wifi           4.32e-03      1.19e-02  1.00e+00  
PIE Chart of for Target: 
[['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'n_cores', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']]


Random state :  32
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'] ; Outer Test ACC:  0.921875


Random state :  41
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.93125


Random state :  45
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.89375


Random state :  52
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.9125


Random state :  65
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.925


Random state :  72
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.915625


Random state :  96
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.90625


Random state :  97
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.925


Random state :  112
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'] ; Outer Test ACC:  0.909375


Random state :  114
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.909375


Random state :  128
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.928125


Random state :  142
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.9

[0.921875, 0.93125, 0.89375, 0.9125, 0.925, 0.915625, 0.90625, 0.925, 0.909375, 0.909375, 0.928125, 0.9]

Maximum Accuracy Index:  1

Best model parameters with random_state:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

Best feature combination:
['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']

Best accuracy from MCCV:
0.93125

{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 200, 'n_jobs': 1, 'nthread': None, 'objective': 'multi:softprob', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 1}

Evaluation of the trained model: 

Accuracy :  0.8925

Kappa Score :  0.8562894264109955

Confusion Matrix :
 [[ 89   4   0   0]
 [  7  91   4   0]
 [  0   9  73   5]
 [  0   0  14 104]]

Classification Report :
              precision    recall  f1-score   support

          0       0.93      0.96      0.94        93
          1       0.88      0.89      0.88       102
          2       0.80      0.84      0.82        87
          3       0.95      0.88      0.92       118

avg / total       0.89      0.89      0.89       400

Feature Importance/Rank Analysis: 
1. feature 10 ram (0.360073)
2. feature 0 battery_power (0.184809)
3. feature 9 px_width (0.144192)
4. feature 8 px_height (0.144192)
5. feature 13 talk_time (0.031885)
6. feature 7 pc (0.030260)
7. feature 12 sc_w (0.027417)
8. feature 2 clock_speed (0.022340)
9. feature 4 fc (0.017669)
10. feature 11 sc_h (0.013607)
11. feature 6 n_cores (0.011576)
12. feature 3 dual_sim (0.003859)
13. feature 15 touch_screen (0.003249)
14. feature 16 wifi (0.001625)
15. feature 1 blue (0.001422)
16. feature 14 three_g (0.001015)
17. feature 5 four_g (0.000812)
(2000, 21)
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

Testing Results of the trained model: 

Accuracy :  0.9675

Kappa Score :  0.9566666666666667

Confusion Matrix :
 [[495   5   0   0]
 [ 13 481   6   0]
 [  0  15 476   9]
 [  0   0  17 483]]

Classification Report :
              precision    recall  f1-score   support

          0       0.97      0.99      0.98       500
          1       0.96      0.96      0.96       500
          2       0.95      0.95      0.95       500
          3       0.98      0.97      0.97       500

avg / total       0.97      0.97      0.97      2000

(1000, 21)
   id  battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \
0   1           1043     1          1.8         1  14       0           5   
1   2            841     1          0.5         1   4       1          61   
2   3           1807     1          2.8         0   1       0          27   
3   4           1546     0          0.5         1  18       1          25   
4   5           1434     0          1.4         0  11       1          49   

   m_dep  mobile_wt  ...  pc  px_height  px_width   ram  sc_h  sc_w  \
0    0.1        193  ...  16        226      1412  3476    12     7   
1    0.8        191  ...  12        746       857  3895     6     0   
2    0.9        186  ...   4       1270      1366  2396    17    10   
3    0.5         96  ...  20        295      1752  3893    10     0   
4    0.5        108  ...  18        749       810  1773    15     8   

   talk_time  three_g  touch_screen  wifi  
0          2        0             1     0  
1          7        1             0     0  
2         10        0             1     1  
3          7        1             1     0  
4          7        1             0     1  

[5 rows x 21 columns]
Index(['id', 'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc',
       'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc',
       'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'],
      dtype='object')
id                 int64
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
dtype: object

Execution Time 204.0043179988861 seconds: 
In [ ]: