Project 06: Gradient Boosting Algorithm with MCCV

In [5]:
# --------------------------------------------------------------------------------------------
# Data Science Competition Recipe - 006 
# Package: scikit-learn, Algorithm: Gradient Boosting Classifier, DataSet: OpenML mobileset price Dataset
# Tuning: Features tuning using RRSSV / MCCV
# --------------------------------------------------------------------------------------------
In [6]:
import warnings
warnings.filterwarnings("ignore")
In [7]:
def DSC_Recipe_6():
    print()
    print(format('Recipe for Data Science Competition - DSC_Recipe_6','*^65'))
    print(format('Classification with OpenML mobileset price dataset using scikit-learn gradient boosting & Monte Carlo Cross Validation', '*^95'))    
    print(format('Package: scikit-learn ','*^65'))            
    print(format('Model: Gradient Boosting Model','*^65'))            
    print(format('DataSet: OpenML mobileset price Dataset', '*^65'))    
    print(format('Model selection: using Monte Carlo Cross Validation (MCCV) / Repeated Random Sub-Sampling Validation (RRSSV)', '*^95'))    

    # load necessary libraries
    import time
    import pandas as pd
    import pickle as pk
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    import scikitplot as skplt
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.metrics import cohen_kappa_score, confusion_matrix
    from sklearn.preprocessing import LabelEncoder    
    start_time = time.time()
    
    # -------------------------------------------------------------------------
    # Helper modules for Descriptive Statistics
    # -------------------------------------------------------------------------    
    def get_redundant_pairs(df):
        pairs_to_drop = set()
        cols = df.columns
        for i in range(0, df.shape[1]):
            for j in range(0, i+1):
                pairs_to_drop.add((cols[i], cols[j]))
        return pairs_to_drop

    def get_top_abs_correlations(df, n=5): 
        #au_corr = df.corr().abs().unstack()
        au_corr = df.corr().unstack()
        labels_to_drop = get_redundant_pairs(df)
        au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
        return au_corr[0:n]

    def corrank(X):
        import itertools
        df = pd.DataFrame([[(i,j), 
                   X.corr().loc[i,j]] for i,j in list(itertools.combinations(X.corr(), 2))],
                   columns=['pairs','corr'])
        print(df.sort_values(by='corr',ascending=False))
        print()

    # Helper module for Label Encoding for Categorical Features
    def dummyEncode(df):
        columnsToEncode = list(df.select_dtypes(include=['category',
                                                     'object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                df[feature] = le.fit_transform(df[feature])
            except:
                print('Error encoding '+feature)
        return df

    # -------------------------------------------------------------------------    
    # load dataset
    # ------------------------------------------------------------------------- 
    def load_dataset(filename):
        
        dataset = pd.read_csv(filename, sep = ',')
        
        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        feature_names = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
                         'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
                         'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
                         'touch_screen', 'wifi']
        
        target = 'price_range'
        
        dataset = dummyEncode(dataset[['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
                                       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
                                       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
                                       'touch_screen', 'wifi', 'price_range']])
        
        return feature_names, target, dataset
    
    # -------------------------------------------------------------------------    
    # find missing values in dataset if exists
    # -------------------------------------------------------------------------
    def find_miising_value(feature_names, target, dataset):
        
        print()
        print('#---------------------------------------------------------------')
        print('Check for Mising Value or NaN Value in the Dataset')
        print('#---------------------------------------------------------------')
        # Method - 1
        # Count Number of Missing Value on Each Column    
        print('\nCount Number of Missing Value on Each Column: ')        
        print(dataset.isnull().sum(axis=0))
    
        # Count Number of Missing Value on Each Row    
        #print('\nCount Number of Missing Value on Each Row: ')        
        #print(dataset.isnull().sum(axis=1))

        # Method - 2
        # Check if there are any missing values in Dataset
        feature_count = dataset.columns[dataset.isnull().sum() != 0].size
        print()
        print("Total Features with missing Values = " + str(feature_count))

        if (feature_count):
            print()
            print("Features with NaN => {}".format(list(dataset.columns[dataset.isnull().sum() != 0])))
            print('Count Number of Missing Value on Each Column: ')        
            print(dataset[dataset.columns[dataset.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False))

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove constant columns in the Dataset')
        print('#---------------------------------------------------------------')
        colsToRemove = []
        for col in dataset.columns:
            if col not in ['price_range']:
                if dataset[col].std() == 0: 
                    colsToRemove.append(col)
        print()
        print("Removed `{}` Constant Columns: ".format(len(colsToRemove)))
        print(colsToRemove)
        # remove constant columns in the Dataset
        dataset.drop(colsToRemove, axis=1, inplace=True)

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove Duplicate Columns in the Dataset')
        print('#---------------------------------------------------------------')
        print()
        print(dataset.columns); print(dataset.head(5))
        print('\nDuplicate Columns in the Dataset: \n', dataset.columns.duplicated())        
        dataset = dataset.loc[:, ~dataset.columns.duplicated()]
        print()
        print(dataset.columns); print(dataset.head(5))
        
        print()
        print('#---------------------------------------------------------------')
        print('Check and Drop Sparse Data/Columns in the Dataset')
        print('#---------------------------------------------------------------')
        flist = [x for x in dataset.columns if not x in ['price_range']]
        print(); print(flist)
        for f in flist:
            if len(np.unique(dataset[f])) < 2:
                print('Feature contains Sparse Data: ', f)
                dataset.drop(f, axis=1, inplace=True)
        print()
        print(dataset.columns); print(dataset.head(5))
        
        # --------------------------------------------------
        # Missing Values treatment in the DataSet (if any)
        # --------------------------------------------------    
        # a) Filling NULL values with Zeros
        #dataset = dataset.fillna(0)
        #print('\nCount Number of Missing Value on Each Column: ')        
        ## Count Number of Missing Value on Each Column
        #print(dataset.isnull().sum(axis=0))
        #print('\nCount Number of Missing Value on Each Row: ')        
        ## Count Number of Missing Value on Each Row
        #print(dataset.isnull().sum(axis=1))

        # b) Filling NULL values according to their dataTypes
        # Group Dataset according to different dataTypes
        gd = dataset.columns.to_series().groupby(dataset.dtypes).groups
        print('\nGroup Columns according to their dataTypes: \n', gd)  
        colNames = dataset.columns.values.tolist()
        for colName in colNames:
            if dataset[colName].dtypes == 'int64':
                dataset[colName] = dataset[colName].fillna(0)
            if dataset[colName].dtypes == 'float64':
                dataset[colName] = dataset[colName].fillna(0.0) 
            if dataset[colName].dtypes == 'object':
                dataset[colName] = dataset[colName].fillna('Unknown')    

        ## Count Number of Missing Value on Each Column    
        print('\nCount Number of Missing Value on Each Column: ')        
        print(dataset.isnull().sum(axis=0))
        ## Count Number of Missing Value on Each Row    
        #print('\nCount Number of Missing Value on Each Row: ')        
        #print(dataset.isnull().sum(axis=1))

        # Check if there are any missing values in Dataset
        feature_count = dataset.columns[dataset.isnull().sum() != 0].size
        print()
        print("Total Features with missing Values = " + str(feature_count))
        
        return(dataset)
    
    # -------------------------------------------------------------------------
    # descriptive statistics and correlation matrix
    # -------------------------------------------------------------------------    
    def data_descriptiveStats(feature_names, target, dataset):
        # Count Number of Missing Value on Each Column    
        print(); print('Count Number of Missing Value on Each Column: ')        
        print(); print(dataset[feature_names].isnull().sum(axis=0))
        print(); print(dataset[target].isnull().sum(axis=0))    
    
        # Get Information on the feature variables
        print(); print('Get Information on the feature variables: ')            
        print(); print(dataset[feature_names].info())
        print(); print(dataset[feature_names].describe())
    
        # correlation
        pd.set_option('precision', 2)
        print(); print(dataset[feature_names].corr())    
    
        # Ranking of Correlation Coefficients among Variable Pairs
        print(); print("Ranking of Correlation Coefficients:")    
        corrank(dataset[feature_names])

        # Print Highly Correlated Variables
        print(); print("Highly correlated variables (Absolute Correlations):")
        print(); print(get_top_abs_correlations(dataset[feature_names], 8))
    
        # Get Information on the target    
        print(); print(dataset[target].describe())    
        print(); print(dataset.groupby(target).size())    
    
    # -------------------------------------------------------------------------
    # data visualisation and correlation graph
    # -------------------------------------------------------------------------
    def data_visualization(feature_names, target, dataset):
        # BOX plots USING box and whisker plots
        i = 1
        print(); print('BOX plot of each numerical features')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(5,4,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=True, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
            i += 1
        plt.show()    
    
        # USING histograms
        j = 1
        print(); print('Histogram of each Numerical Feature')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(5,4,j)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].hist()
            j += 1
        plt.show()

        # correlation matrix
        print(); print('Correlation Matrix of All Numerical Features')   
        fig = plt.figure(figsize=(11,9))
        ax = fig.add_subplot(111)
        cax = ax.matshow(dataset[feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
        fig.colorbar(cax)
        ticks = np.arange(0,20,1)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        plt.show()

        # Correlation Plot using seaborn
        print(); print("Correlation plot of Numerical features")
        # Compute the correlation matrix
        corr = dataset[feature_names].corr()
        print(corr)
        # Generate a mask for the upper triangle
        mask = np.zeros_like(corr, dtype=np.bool)
        mask[np.triu_indices_from(mask)] = True
        # Set up the matplotlib figure
        f, ax = plt.subplots(figsize=(11, 9))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, vmin= -1.0, center=0, square=True, 
                    linewidths=.5, cbar_kws={"shrink": .5})
        plt.show()    
    
        # Pie chart for Categorical Variables
        print(); print('PIE Chart of for Target: ')
        plt.figure(figsize=(11,9)) 
        i = 1
        for colName in [target]:
            labels = []; sizes = [];
            df = dataset.groupby(colName).size()
            for key in df.keys():
                labels.append(key)
                sizes.append(df[key])
            # Plot PIE Chart with %
            plt.subplot(2,2,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, 
                            labelleft=True, labeltop=True, labelright=False, labelbottom=False)        
            plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
            plt.axis('equal')
            i += 1; plt.savefig('Piefig.pdf', format='pdf')
        plt.show()    
    
    # -------------------------------------------------------------------------
    # data split to train and test datasets
    # -------------------------------------------------------------------------    
    def data_split(feature_names, target, dataset):
        # Data Transform - Split train : test datasets
        X_train, X_test, y_train, y_test = train_test_split(dataset.loc[:, feature_names], 
                                                            dataset.loc[:, target], test_size=0.05)
        return X_train, X_test, y_train, y_test

    def training_model(X_train, y_train):

        _value = []; _model = []; _best_features = [];

        # Create different Feature subsets
        F1 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        F2 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        F3 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
              
              'touch_screen', 'wifi']
        
        F4 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'
              ]
        
        F5 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'm_dep', 
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        F6 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'n_cores', 'pc', 'px_height',
              'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        F7 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
              'int_memory', 'n_cores', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
              'touch_screen', 'wifi']
        
        subsets_sum = [F1] + [F2] + [F3] + [F4] + [F5] + [F6] + [F7]
        
        print(subsets_sum)
        
        # Twelve random sates randomly choosen for the outer-MCCV
        for i in [32,41,45,52,65,72,96,97,112,114,128,142]:
            print ('\n\nRandom state : ', i)
            
            model = GradientBoostingClassifier(random_state=i, n_estimators=200)

            #  Split the dataset into two stratified parts, 80% for Outer training set
            X1_train, X1_test, y1_train, y1_test = train_test_split(X_train, y_train, 
                                                train_size=0.8, random_state=i, stratify=y_train)

            # Choose k-fold cross-validation technique for the inner loop
            inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

            # Set temporary variables
            best_subset = []
            best_auc = -np.inf

            # Loop over the features combinations
            for subset in subsets_sum:
                score = cross_val_score(model, X=X1_train[subset], y=y1_train, 
                                        cv = inner_cv.split(X1_train[subset], y1_train), 
                                        scoring='accuracy')
                if score.mean() > best_auc:
                    best_auc = score.mean()
                    best_subset = subset

            # Train the model on the Outer training set with the selected feature combination
            model = model.fit(X1_train[best_subset], y1_train)
            
            # Calculate the predicted labels with the model on the Outer test set with the selected feature combination
            y1_pred = model.predict(X1_test[best_subset])
            
            # Calculate the accuracy between predicted and true labels
            acc = accuracy_score(y1_test, y1_pred)
            print('Selected features:', best_subset,'; Outer Test ACC: ',acc)
            
            _best_features.append(best_subset); _value.append(acc); _model.append(model); 
        
        #for i in range(0, len(_value)):
        #    print(); print(_best_features[i]); print('Accuracy: ',_value[i]); print(_model[i])
        
        print(); print(_value)
        print(); print('Maximum Accuracy Index: ', np.argmax(_value))
        
        idx = np.argmax(_value)
        print("\nBest model parameters with random_state:");    print(_model[idx])
        print("\nBest feature combination:");    print(_best_features[idx])
        print("\nBest accuracy from MCCV:");    print(_value[idx])
        
        return(_model[idx], _best_features[idx])

    def evaluate_model(model, features, X_train, y_train, X_test, y_test):
        
        print()
        print(model.get_params(deep=True))
        
        # Evaluate the skill of the Trained model
        pred_Class          = model.predict(X_test[features])
        acc                 = accuracy_score(y_test, pred_Class)
        classReport         = classification_report(y_test, pred_Class)
        confMatrix          = confusion_matrix(y_test, pred_Class) 
        kappa_score         = cohen_kappa_score(y_test, pred_Class)         
        
        print(); print('Evaluation of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)

        pred_proba = model.predict_proba(X_test[features])
        
        # Add more plots here using scikit-plot
        # ROC curves
        skplt.metrics.plot_roc(y_test,pred_proba,figsize=(9,9)); plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_test,pred_Class,figsize=(9,9)); plt.show()        

        # precision recall curve
        skplt.metrics.plot_precision_recall(y_test, pred_proba, 
                title='Precision-Recall Curve', plot_micro=True, 
                classes_to_plot=None, ax=None, figsize=(9,9), 
                cmap='nipy_spectral', title_fontsize='large', 
                text_fontsize='medium'); plt.show()
        
        # Add more ... ... ...
        
        # plot learning Curves
        skplt.estimators.plot_learning_curve(model, X_train[features], y_train, figsize=(6,6))
        plt.show()
        
        return model
    
    def featureRank_Analysis(model, dataset, cols):
        print()
        print("Feature Importance/Rank Analysis: ")
        X = dataset.loc[:, cols]; X_cols = X.columns.values
    
        features_imp = model.feature_importances_    
    
        indices = np.argsort(features_imp)[::-1]
        df = {}
        for f in range(X.shape[1]):
            print("%d. feature %d %s (%f)" % (f + 1, indices[f], X_cols[indices[f]], 
                                              features_imp[indices[f]]))
            df[f] = [f + 1, indices[f], X_cols[indices[f]], features_imp[indices[f]]]

        df1 = pd.DataFrame.from_dict(df, orient = 'index')
        df1.columns = ['feature_Rank', 'feature_Index', 'feature_Name', 'feature_importance']
        df1.to_csv("FeatureImportanceRank.csv", index = False)

        # this creates a figure 5 inch wide, 3 inch high
        plt.figure(figsize=(11,11)) 
        plt.barh(df1['feature_Rank'], df1['feature_importance'], tick_label = df1['feature_Name'])
        plt.savefig('Featurefig.pdf', format='pdf')
        plt.show()   

        skplt.estimators.plot_feature_importances(model, feature_names=cols,
                                                  x_tick_rotation = 90, figsize=(11,11))
        plt.show()

        # ------------------------------------------------
        # Visualise the tree-graph (GradientBoosting)
        # ------------------------------------------------
        # install graphViz and pydotplus using pip
        # install binaries from graphViz.org and 
        # add PATH variables
        # Follow the instruction @
        # https://stackoverflow.com/questions/18438997/
        # why-is-pydot-unable-to-find-graphvizs-executables-in-windows-8
        # ------------------------------------------------
        # Get an arbitary tree number between (0,99) 
        # as "n_estimators = 100"

        '''
        sub_tree_number = 49 
        from sklearn import tree
        from sklearn.externals.six import StringIO  
        import pydotplus
    
        # Create a dot file
        dotfile = open("tree.dot", 'w')
        tree.export_graphviz(
                model.estimators_[sub_tree_number, 0], 
                #model.estimators_[sub_tree_number], 
                out_file = dotfile, feature_names = X_cols)
        dotfile.close()    

        # Create pdf and png from the dot data
        dot_data = StringIO()
        tree.export_graphviz(            
                model.estimators_[sub_tree_number, 0], 
                #model.estimators_[sub_tree_number], 
                out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,
                feature_names = X_cols)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
        graph.write_png("tree.png")
        graph.write_pdf("tree.pdf")
        '''
    
    def save_model(model):
        with open('DSC_Recipe_6_model.pickle', 'wb') as f: 
            pk.dump(model, f)

    def final_prediction(feature_names, filename):
        # load model
        f = open('DSC_Recipe_6_model.pickle', 'rb')
        model = pk.load(f); f.close();
        
        # load dataset
        dataset = pd.read_csv(filename, sep = ',')

        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        dataset = dummyEncode(dataset)
        
        # final prediction and results
        predicted_class     = model.predict(dataset[feature_names])
        pred_proba          = model.predict_proba(dataset[feature_names])        
        dataset['predicted_class'] = predicted_class

        # Evaluate the skill of the Trained model
        acc                 = accuracy_score(dataset['price_range'], predicted_class)
        classReport         = classification_report(dataset['price_range'], predicted_class)
        confMatrix          = confusion_matrix(dataset['price_range'], predicted_class) 
        kappa_score         = cohen_kappa_score(dataset['price_range'], predicted_class)         
        
        print(); print('Testing Results of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)
        
        # ROC curves
        skplt.metrics.plot_roc(dataset['price_range'],pred_proba,figsize=(7,7)); plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(dataset['price_range'],
                                            predicted_class,figsize=(7,7)); plt.show()        

        # precision recall curve
        skplt.metrics.plot_precision_recall(dataset['price_range'], pred_proba, 
                title='Precision-Recall Curve', plot_micro=True, 
                classes_to_plot=None, ax=None, figsize=(7,7), 
                cmap='nipy_spectral', title_fontsize='large', 
                text_fontsize='medium'); plt.show()               
        
        dataset.to_csv('FinalResult.csv', index = False, 
                       columns = ['price_range', 'predicted_class'])

    def final_prediction_with_testDataset(feature_names, filename):
        # load model
        f = open('DSC_Recipe_6_model.pickle', 'rb')
        model = pk.load(f); f.close();
        
        # load dataset
        dataset = pd.read_csv(filename, sep = ',')

        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        dataset = dummyEncode(dataset)
        
        # final prediction and results
        predicted_class     = model.predict(dataset[feature_names])
        pred_proba          = model.predict_proba(dataset[feature_names])        

        dataset['predicted_class'] = predicted_class
        dataset['predicted_proba'] = pred_proba.tolist()
        
        dataset.to_csv('FinalResultWith_testDataset.csv', index = False)
    
    if __name__ == '__main__':
        print()
        print("Execution Time %s seconds: " % (start_time))
        filename = 'mobilePriceClassification_trainDataset.csv'
        
        feature_names, target, dataset = load_dataset(filename)
        dataset = find_miising_value(feature_names, target, dataset)
        data_descriptiveStats(feature_names, target, dataset)
        data_visualization(feature_names, target, dataset)
        X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)
        model, features = training_model(X_train, y_train)
        model = evaluate_model(model, features, X_train, y_train, X_test, y_test)
        featureRank_Analysis(model, dataset, features)
        save_model(model) 
        
        test_filename = 'mobilePriceClassification_trainDataset.csv'
        final_prediction(features, test_filename)

        test_filename = 'mobilePriceClassification_testDataset.csv'
        final_prediction_with_testDataset(features, test_filename)
        
        print()
        print("Execution Time %s seconds: " % (time.time() - start_time))
In [8]:
DSC_Recipe_6()
*******Recipe for Data Science Competition - DSC_Recipe_6********
Classification with OpenML mobileset price dataset using scikit-learn gradient boosting & Monte Carlo Cross Validation
*********************Package: scikit-learn **********************
*****************Model: Gradient Boosting Model******************
*************DataSet: OpenML mobileset price Dataset*************
Model selection: using Monte Carlo Cross Validation (MCCV) / Repeated Random Sub-Sampling Validation (RRSSV)

Execution Time 1614828367.4704208 seconds: 
(2000, 21)
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

#---------------------------------------------------------------
Check for Mising Value or NaN Value in the Dataset
#---------------------------------------------------------------

Count Number of Missing Value on Each Column: 
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

#---------------------------------------------------------------
Check and Remove constant columns in the Dataset
#---------------------------------------------------------------

Removed `0` Constant Columns: 
[]

#---------------------------------------------------------------
Check and Remove Duplicate Columns in the Dataset
#---------------------------------------------------------------

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

Duplicate Columns in the Dataset: 
 [False False False False False False False False False False False False
 False False False False False False False False False]

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

#---------------------------------------------------------------
Check and Drop Sparse Data/Columns in the Dataset
#---------------------------------------------------------------

['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

Group Columns according to their dataTypes: 
 {int64: ['battery_power', 'blue', 'dual_sim', 'fc', 'four_g', 'int_memory', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range'], float64: ['clock_speed', 'm_dep']}

Count Number of Missing Value on Each Column: 
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

Count Number of Missing Value on Each Column: 

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
dtype: int64

0

Get Information on the feature variables: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_screen   2000 non-null   int64  
 19  wifi           2000 non-null   int64  
dtypes: float64(2), int64(18)
memory usage: 312.6 KB
None

       battery_power     blue  clock_speed  dual_sim       fc   four_g  \
count        2000.00  2000.00      2000.00   2000.00  2000.00  2000.00   
mean         1238.52     0.49         1.52      0.51     4.31     0.52   
std           439.42     0.50         0.82      0.50     4.34     0.50   
min           501.00     0.00         0.50      0.00     0.00     0.00   
25%           851.75     0.00         0.70      0.00     1.00     0.00   
50%          1226.00     0.00         1.50      1.00     3.00     1.00   
75%          1615.25     1.00         2.20      1.00     7.00     1.00   
max          1998.00     1.00         3.00      1.00    19.00     1.00   

       int_memory    m_dep  mobile_wt  n_cores       pc  px_height  px_width  \
count     2000.00  2000.00    2000.00  2000.00  2000.00    2000.00   2000.00   
mean        32.05     0.50     140.25     4.52     9.92     645.11   1251.52   
std         18.15     0.29      35.40     2.29     6.06     443.78    432.20   
min          2.00     0.10      80.00     1.00     0.00       0.00    500.00   
25%         16.00     0.20     109.00     3.00     5.00     282.75    874.75   
50%         32.00     0.50     141.00     4.00    10.00     564.00   1247.00   
75%         48.00     0.80     170.00     7.00    15.00     947.25   1633.00   
max         64.00     1.00     200.00     8.00    20.00    1960.00   1998.00   

           ram     sc_h     sc_w  talk_time  three_g  touch_screen     wifi  
count  2000.00  2000.00  2000.00    2000.00  2000.00        2000.0  2000.00  
mean   2124.21    12.31     5.77      11.01     0.76           0.5     0.51  
std    1084.73     4.21     4.36       5.46     0.43           0.5     0.50  
min     256.00     5.00     0.00       2.00     0.00           0.0     0.00  
25%    1207.50     9.00     2.00       6.00     1.00           0.0     0.00  
50%    2146.50    12.00     5.00      11.00     1.00           1.0     1.00  
75%    3064.50    16.00     9.00      16.00     1.00           1.0     1.00  
max    3998.00    19.00    18.00      20.00     1.00           1.0     1.00  

               battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02   
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03   
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04   
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02   
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00   
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02   
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02   
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03   
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02   
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02   
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01   
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03   
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03   
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02   
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02   
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02   
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03   
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03   
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02   
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02   

                 four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02   
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03   
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03   
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02   
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01   
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03   
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02   
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02   
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02   
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03   
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00   
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02   
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03   
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02   
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03   
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02   
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02   
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03   
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03   
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03   

               px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02   
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02   
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02   
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02   
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03   
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02   
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03   
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02   
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03   
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02   
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02   
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02   
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03   
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02   
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02   
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02   
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00   
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02   
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02   
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02   

                three_g  touch_screen      wifi  
battery_power  1.15e-02     -1.05e-02 -8.34e-03  
blue          -3.02e-02      1.01e-02 -2.19e-02  
clock_speed   -4.64e-02      1.98e-02 -2.45e-02  
dual_sim      -1.40e-02     -1.71e-02  2.27e-02  
fc             1.79e-03     -1.48e-02  2.01e-02  
four_g         5.84e-01      1.68e-02 -1.76e-02  
int_memory    -9.37e-03     -2.70e-02  6.99e-03  
m_dep         -1.21e-02     -2.64e-03 -2.84e-02  
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04  
n_cores       -1.47e-02      2.38e-02 -9.96e-03  
pc            -1.32e-03     -8.74e-03  5.39e-03  
px_height     -3.12e-02      2.19e-02  5.18e-02  
px_width       3.50e-04     -1.63e-03  3.03e-02  
ram            1.58e-02     -3.05e-02  2.27e-02  
sc_h           1.20e-02     -2.00e-02  2.59e-02  
sc_w           3.09e-02      1.27e-02  3.54e-02  
talk_time     -4.27e-02      1.72e-02 -2.95e-02  
three_g        1.00e+00      1.39e-02  4.32e-03  
touch_screen   1.39e-02      1.00e+00  1.19e-02  
wifi           4.32e-03      1.19e-02  1.00e+00  

Ranking of Correlation Coefficients:
                         pairs  corr
75                    (fc, pc)  0.64
96           (four_g, three_g)  0.58
154      (px_height, px_width)  0.51
175               (sc_h, sc_w)  0.51
156          (px_height, sc_h)  0.06
..                         ...   ...
2    (battery_power, dual_sim) -0.04
184       (talk_time, three_g) -0.04
39       (clock_speed, four_g) -0.04
51      (clock_speed, three_g) -0.05
95         (four_g, talk_time) -0.05

[190 rows x 2 columns]


Highly correlated variables (Absolute Correlations):

fc             pc           0.64
four_g         three_g      0.58
px_height      px_width     0.51
sc_h           sc_w         0.51
px_height      sc_h         0.06
battery_power  talk_time    0.05
px_height      wifi         0.05
               sc_w         0.04
dtype: float64

count    2000.00
mean        1.50
std         1.12
min         0.00
25%         0.75
50%         1.50
75%         2.25
max         3.00
Name: price_range, dtype: float64

price_range
0    500
1    500
2    500
3    500
dtype: int64

BOX plot of each numerical features
Histogram of each Numerical Feature
Correlation Matrix of All Numerical Features
Correlation plot of Numerical features
               battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02   
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03   
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04   
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02   
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00   
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02   
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02   
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03   
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02   
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02   
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01   
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03   
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03   
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02   
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02   
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02   
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03   
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03   
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02   
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02   

                 four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02   
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03   
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03   
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02   
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01   
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03   
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02   
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02   
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02   
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03   
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00   
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02   
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03   
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02   
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03   
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02   
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02   
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03   
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03   
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03   

               px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02   
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02   
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02   
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02   
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03   
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02   
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03   
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02   
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03   
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02   
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02   
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02   
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03   
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02   
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02   
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02   
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00   
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02   
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02   
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02   

                three_g  touch_screen      wifi  
battery_power  1.15e-02     -1.05e-02 -8.34e-03  
blue          -3.02e-02      1.01e-02 -2.19e-02  
clock_speed   -4.64e-02      1.98e-02 -2.45e-02  
dual_sim      -1.40e-02     -1.71e-02  2.27e-02  
fc             1.79e-03     -1.48e-02  2.01e-02  
four_g         5.84e-01      1.68e-02 -1.76e-02  
int_memory    -9.37e-03     -2.70e-02  6.99e-03  
m_dep         -1.21e-02     -2.64e-03 -2.84e-02  
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04  
n_cores       -1.47e-02      2.38e-02 -9.96e-03  
pc            -1.32e-03     -8.74e-03  5.39e-03  
px_height     -3.12e-02      2.19e-02  5.18e-02  
px_width       3.50e-04     -1.63e-03  3.03e-02  
ram            1.58e-02     -3.05e-02  2.27e-02  
sc_h           1.20e-02     -2.00e-02  2.59e-02  
sc_w           3.09e-02      1.27e-02  3.54e-02  
talk_time     -4.27e-02      1.72e-02 -2.95e-02  
three_g        1.00e+00      1.39e-02  4.32e-03  
touch_screen   1.39e-02      1.00e+00  1.19e-02  
wifi           4.32e-03      1.19e-02  1.00e+00  
PIE Chart of for Target: 
[['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'n_cores', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']]


Random state :  32
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.9184210526315789


Random state :  41
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.8894736842105263


Random state :  45
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.9315789473684211


Random state :  52
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.8921052631578947


Random state :  65
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.9105263157894737


Random state :  72
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'] ; Outer Test ACC:  0.9131578947368421


Random state :  96
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.9210526315789473


Random state :  97
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.8947368421052632


Random state :  112
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'] ; Outer Test ACC:  0.9157894736842105


Random state :  114
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.8921052631578947


Random state :  128
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.8868421052631579


Random state :  142
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'] ; Outer Test ACC:  0.9184210526315789

[0.9184210526315789, 0.8894736842105263, 0.9315789473684211, 0.8921052631578947, 0.9105263157894737, 0.9131578947368421, 0.9210526315789473, 0.8947368421052632, 0.9157894736842105, 0.8921052631578947, 0.8868421052631579, 0.9184210526315789]

Maximum Accuracy Index:  2

Best model parameters with random_state:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=45, subsample=1.0, verbose=0,
              warm_start=False)

Best feature combination:
['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']

Best accuracy from MCCV:
0.9315789473684211

{'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'presort': 'auto', 'random_state': 45, 'subsample': 1.0, 'verbose': 0, 'warm_start': False}

Evaluation of the trained model: 

Accuracy :  0.89

Kappa Score :  0.8506043732174385

Confusion Matrix :
 [[14  3  0  0]
 [ 0 23  1  0]
 [ 0  1 28  2]
 [ 0  0  4 24]]

Classification Report :
              precision    recall  f1-score   support

          0       1.00      0.82      0.90        17
          1       0.85      0.96      0.90        24
          2       0.85      0.90      0.88        31
          3       0.92      0.86      0.89        28

avg / total       0.90      0.89      0.89       100

Feature Importance/Rank Analysis: 
1. feature 10 ram (0.541196)
2. feature 0 battery_power (0.126261)
3. feature 9 px_width (0.105712)
4. feature 8 px_height (0.104996)
5. feature 13 talk_time (0.026017)
6. feature 12 sc_w (0.019718)
7. feature 6 n_cores (0.015313)
8. feature 11 sc_h (0.012968)
9. feature 7 pc (0.012471)
10. feature 4 fc (0.010312)
11. feature 2 clock_speed (0.008709)
12. feature 3 dual_sim (0.007043)
13. feature 5 four_g (0.003289)
14. feature 15 touch_screen (0.002350)
15. feature 16 wifi (0.002209)
16. feature 1 blue (0.001091)
17. feature 14 three_g (0.000343)
(2000, 21)
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

Testing Results of the trained model: 

Accuracy :  0.9815

Kappa Score :  0.9753333333333334

Confusion Matrix :
 [[493   7   0   0]
 [  2 494   4   0]
 [  0   5 487   8]
 [  0   0  11 489]]

Classification Report :
              precision    recall  f1-score   support

          0       1.00      0.99      0.99       500
          1       0.98      0.99      0.98       500
          2       0.97      0.97      0.97       500
          3       0.98      0.98      0.98       500

avg / total       0.98      0.98      0.98      2000

(1000, 21)
   id  battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \
0   1           1043     1          1.8         1  14       0           5   
1   2            841     1          0.5         1   4       1          61   
2   3           1807     1          2.8         0   1       0          27   
3   4           1546     0          0.5         1  18       1          25   
4   5           1434     0          1.4         0  11       1          49   

   m_dep  mobile_wt  ...  pc  px_height  px_width   ram  sc_h  sc_w  \
0    0.1        193  ...  16        226      1412  3476    12     7   
1    0.8        191  ...  12        746       857  3895     6     0   
2    0.9        186  ...   4       1270      1366  2396    17    10   
3    0.5         96  ...  20        295      1752  3893    10     0   
4    0.5        108  ...  18        749       810  1773    15     8   

   talk_time  three_g  touch_screen  wifi  
0          2        0             1     0  
1          7        1             0     0  
2         10        0             1     1  
3          7        1             1     0  
4          7        1             0     1  

[5 rows x 21 columns]
Index(['id', 'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc',
       'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc',
       'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'],
      dtype='object')
id                 int64
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
dtype: object

Execution Time 580.689519405365 seconds: 
In [ ]: