Project 07: XGBoost Algorithm with Grid Search CV

In [5]:
# -----------------------------------------------------------------------------------------------
# Data Science Competition Recipe - 007
# Using Package: scikit-learn, Algorithm: Xgboost Classifier, DataSet: OpenML mobileset price Dataset
# Tuning: Parameters tuning using GridSearchCV
# -----------------------------------------------------------------------------------------------
In [6]:
import warnings
warnings.filterwarnings("ignore")
In [7]:
def DSC_Recipe_7():
    print()
    print(format('Recipe for Data Science Competition - DSC_Recipe_7','*^65'))
    print(format('Classification with OpenML mobileset price dataset using xgboost and Grid Search Cross Valodation', '*^95'))    
    print(format('Package: scikit-learn ','*^65'))            
    print(format('Model: XGBoost Model','*^65'))            
    print(format('DataSet: OpenML mobileset price Dataset', '*^65'))    
    print(format('Model selection: using Grid Search Cross Validation (GSCV)', '*^65'))    

    # load necessary libraries
    import time
    import pandas as pd
    import pickle as pk
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    import scikitplot as skplt
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    import xgboost
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.metrics import cohen_kappa_score, confusion_matrix
    from sklearn.preprocessing import LabelEncoder    
    start_time = time.time()
    
    # -------------------------------------------------------------------------
    # declare contants
    # -------------------------------------------------------------------------
    kfold = 10
    
    # -------------------------------------------------------------------------
    # Helper modules for Descriptive Statistics
    # -------------------------------------------------------------------------    
    def get_redundant_pairs(df):
        pairs_to_drop = set()
        cols = df.columns
        for i in range(0, df.shape[1]):
            for j in range(0, i+1):
                pairs_to_drop.add((cols[i], cols[j]))
        return pairs_to_drop

    def get_top_abs_correlations(df, n=5): 
        #au_corr = df.corr().abs().unstack()
        au_corr = df.corr().unstack()
        labels_to_drop = get_redundant_pairs(df)
        au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
        return au_corr[0:n]

    def corrank(X):
        import itertools
        df = pd.DataFrame([[(i,j), 
                   X.corr().loc[i,j]] for i,j in list(itertools.combinations(X.corr(), 2))],
                   columns=['pairs','corr'])
        print(df.sort_values(by='corr',ascending=False))
        print()

    # Helper module for Label Encoding for Categorical Features
    def dummyEncode(df):
        columnsToEncode = list(df.select_dtypes(include=['category',
                                                     'object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                df[feature] = le.fit_transform(df[feature])
            except:
                print('Error encoding '+feature)
        return df

    # -------------------------------------------------------------------------    
    # load dataset
    # ------------------------------------------------------------------------- 
    def load_dataset(filename):
        
        dataset = pd.read_csv(filename, sep = ',')
        
        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        feature_names = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
                         'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
                         'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
                         'touch_screen', 'wifi']
        
        target = 'price_range'
        
        dataset = dummyEncode(dataset[['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
                                       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
                                       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
                                       'touch_screen', 'wifi', 'price_range']])
        
        return feature_names, target, dataset

    # -------------------------------------------------------------------------    
    # find missing values in dataset if exists
    # -------------------------------------------------------------------------
    def find_miising_value(feature_names, target, dataset):
        
        print()
        print('#---------------------------------------------------------------')
        print('Check for Mising Value or NaN Value in the Dataset')
        print('#---------------------------------------------------------------')
        # Method - 1
        # Count Number of Missing Value on Each Column    
        print('\nCount Number of Missing Value on Each Column: ')        
        print(dataset.isnull().sum(axis=0))
    
        # Count Number of Missing Value on Each Row    
        #print('\nCount Number of Missing Value on Each Row: ')        
        #print(dataset.isnull().sum(axis=1))

        # Method - 2
        # Check if there are any missing values in Dataset
        feature_count = dataset.columns[dataset.isnull().sum() != 0].size
        print()
        print("Total Features with missing Values = " + str(feature_count))

        if (feature_count):
            print()
            print("Features with NaN => {}".format(list(dataset.columns[dataset.isnull().sum() != 0])))
            print('Count Number of Missing Value on Each Column: ')        
            print(dataset[dataset.columns[dataset.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False))

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove constant columns in the Dataset')
        print('#---------------------------------------------------------------')
        colsToRemove = []
        for col in dataset.columns:
            if col not in ['price_range']:
                if dataset[col].std() == 0: 
                    colsToRemove.append(col)
        print()
        print("Removed `{}` Constant Columns: ".format(len(colsToRemove)))
        print(colsToRemove)
        # remove constant columns in the Dataset
        dataset.drop(colsToRemove, axis=1, inplace=True)

        print()
        print('#---------------------------------------------------------------')
        print('Check and Remove Duplicate Columns in the Dataset')
        print('#---------------------------------------------------------------')
        print()
        print(dataset.columns); print(dataset.head(5))
        print('\nDuplicate Columns in the Dataset: \n', dataset.columns.duplicated())        
        dataset = dataset.loc[:, ~dataset.columns.duplicated()]
        print()
        print(dataset.columns); print(dataset.head(5))
        
        print()
        print('#---------------------------------------------------------------')
        print('Check and Drop Sparse Data/Columns in the Dataset')
        print('#---------------------------------------------------------------')
        flist = [x for x in dataset.columns if not x in ['price_range']]
        print(); print(flist)
        for f in flist:
            if len(np.unique(dataset[f])) < 2:
                print('Feature contains Sparse Data: ', f)
                dataset.drop(f, axis=1, inplace=True)
        print()
        print(dataset.columns); print(dataset.head(5))
        
        # --------------------------------------------------
        # Missing Values treatment in the DataSet (if any)
        # --------------------------------------------------    
        # a) Filling NULL values with Zeros
        #dataset = dataset.fillna(0)
        #print('\nCount Number of Missing Value on Each Column: ')        
        ## Count Number of Missing Value on Each Column
        #print(dataset.isnull().sum(axis=0))
        #print('\nCount Number of Missing Value on Each Row: ')        
        ## Count Number of Missing Value on Each Row
        #print(dataset.isnull().sum(axis=1))

        # b) Filling NULL values according to their dataTypes
        # Group Dataset according to different dataTypes
        gd = dataset.columns.to_series().groupby(dataset.dtypes).groups
        print('\nGroup Columns according to their dataTypes: \n', gd)  
        colNames = dataset.columns.values.tolist()
        for colName in colNames:
            if dataset[colName].dtypes == 'int64':
                dataset[colName] = dataset[colName].fillna(0)
            if dataset[colName].dtypes == 'float64':
                dataset[colName] = dataset[colName].fillna(0.0) 
            if dataset[colName].dtypes == 'object':
                dataset[colName] = dataset[colName].fillna('Unknown')    

        ## Count Number of Missing Value on Each Column    
        print('\nCount Number of Missing Value on Each Column: ')        
        print(dataset.isnull().sum(axis=0))
        ## Count Number of Missing Value on Each Row    
        #print('\nCount Number of Missing Value on Each Row: ')        
        #print(dataset.isnull().sum(axis=1))

        # Check if there are any missing values in Dataset
        feature_count = dataset.columns[dataset.isnull().sum() != 0].size
        print()
        print("Total Features with missing Values = " + str(feature_count))
        
        return(dataset)
    
    # -------------------------------------------------------------------------
    # descriptive statistics and correlation matrix
    # -------------------------------------------------------------------------    
    def data_descriptiveStats(feature_names, target, dataset):
        # Count Number of Missing Value on Each Column    
        print(); print('Count Number of Missing Value on Each Column: ')        
        print(); print(dataset[feature_names].isnull().sum(axis=0))
        print(); print(dataset[target].isnull().sum(axis=0))    
    
        # Get Information on the feature variables
        print(); print('Get Information on the feature variables: ')            
        print(); print(dataset[feature_names].info())
        print(); print(dataset[feature_names].describe())
    
        # correlation
        pd.set_option('precision', 2)
        print(); print(dataset[feature_names].corr())    
    
        # Ranking of Correlation Coefficients among Variable Pairs
        print(); print("Ranking of Correlation Coefficients:")    
        corrank(dataset[feature_names])

        # Print Highly Correlated Variables
        print(); print("Highly correlated variables (Absolute Correlations):")
        print(); print(get_top_abs_correlations(dataset[feature_names], 8))
    
        # Get Information on the target    
        print(); print(dataset[target].describe())    
        print(); print(dataset.groupby(target).size())    
    
    # -------------------------------------------------------------------------
    # data visualisation and correlation graph
    # -------------------------------------------------------------------------
    def data_visualization(feature_names, target, dataset):
        # BOX plots USING box and whisker plots
        i = 1
        print(); print('BOX plot of each numerical features')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(5,4,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=True, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
            i += 1
        plt.show()    
    
        # USING histograms
        j = 1
        print(); print('Histogram of each Numerical Feature')
        plt.figure(figsize=(11,9))     
        for col in feature_names:
            plt.subplot(5,4,j)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False, 
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].hist()
            j += 1
        plt.show()

        # correlation matrix
        print(); print('Correlation Matrix of All Numerical Features')   
        fig = plt.figure(figsize=(11,9))
        ax = fig.add_subplot(111)
        cax = ax.matshow(dataset[feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
        fig.colorbar(cax)
        ticks = np.arange(0,20,1)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        plt.show()

        # Correlation Plot using seaborn
        print(); print("Correlation plot of Numerical features")
        # Compute the correlation matrix
        corr = dataset[feature_names].corr()
        print(corr)
        # Generate a mask for the upper triangle
        mask = np.zeros_like(corr, dtype=np.bool)
        mask[np.triu_indices_from(mask)] = True
        # Set up the matplotlib figure
        f, ax = plt.subplots(figsize=(11, 9))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, vmin= -1.0, center=0, square=True, 
                    linewidths=.5, cbar_kws={"shrink": .5})
        plt.show()    
    
        # Pie chart for Categorical Variables
        print(); print('PIE Chart of for Target: ')
        plt.figure(figsize=(11,9)) 
        i = 1
        for colName in [target]:
            labels = []; sizes = [];
            df = dataset.groupby(colName).size()
            for key in df.keys():
                labels.append(key)
                sizes.append(df[key])
            # Plot PIE Chart with %
            plt.subplot(2,2,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, 
                            labelleft=True, labeltop=True, labelright=False, labelbottom=False)        
            plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
            plt.axis('equal')
            i += 1; plt.savefig('Piefig.pdf', format='pdf')
        plt.show()    
    
    # -------------------------------------------------------------------------
    # data split to train and test datasets
    # -------------------------------------------------------------------------    
    def data_split(feature_names, target, dataset):
        # Data Transform - Split train : test datasets
        X_train, X_test, y_train, y_test = train_test_split(dataset.loc[:, feature_names], 
                                                            dataset.loc[:, target], test_size=0.33)
        return X_train, X_test, y_train, y_test

    def training_model(X_train, y_train):
        model = xgboost.XGBClassifier(objective = 'binary:logistic')
        
        parameters = {'max_depth'     : [6,8,10],
                      'gamma'         : [0.2,0.5,0.6],
                      'learning_rate' : [0.01, 0.05, 0.1],
                      'n_estimators'  : [100, 500, 1000]
                      # Add more ... ... ... 
                      }
        grid = GridSearchCV(estimator=model, param_grid = parameters, cv = kfold, 
                            verbose = 1, n_jobs = -1, refit = True)
        grid.fit(X_train, y_train)

        # Results from Grid Search
        print("\n========================================================")
        print(" Results from Grid Search " )
        print("========================================================")    
        print("\n The best estimator across ALL searched params:\n",
              grid.best_estimator_)
        print("\n The best score across ALL searched params:\n",
              grid.best_score_)
        print("\n The best parameters across ALL searched params:\n",
              grid.best_params_)
        print("\n ========================================================")

        return(grid.best_estimator_)

    def cross_validatin_and_fitting(model, X_train, y_train):
        cv_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy', 
                                 n_jobs = -1, verbose = 1)
        # Cross Validation Results
        print()
        print("Cross Validation results: ", cv_results)
        prt_string = "CV Mean Accuracy: %f (Std: %f)"% (cv_results.mean(), cv_results.std())
        print(prt_string)
        
        # Final fitting of the Model
        model.fit(X_train, y_train)
        
        print(); print('========================================================')
        print(); print(model.get_params(deep = True))
        print(); print('========================================================')        
        
        return model

    def evaluate_model(model, X_test, y_test):
        # Evaluate the skill of the Trained model
        pred_Class          = model.predict(X_test)
        pred_proba          = model.predict_proba(X_test)
        acc                 = accuracy_score(y_test, pred_Class)
        classReport         = classification_report(y_test, pred_Class)
        confMatrix          = confusion_matrix(y_test, pred_Class) 
        kappa_score         = cohen_kappa_score(y_test, pred_Class)         
        
        print(); print('Evaluation of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)

        pred_proba = model.predict_proba(X_test)
        
        # Add more plots here using scikit-plot
        # ROC curves
        skplt.metrics.plot_roc(y_test,pred_proba,figsize=(8,6)); plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_test,pred_Class,figsize=(6,6)); plt.show()        
        
        # precision recall curve
        skplt.metrics.plot_precision_recall(y_test, pred_proba, 
                title='Precision-Recall Curve', plot_micro=True, 
                classes_to_plot=None, ax=None, figsize=(9,6), 
                cmap='nipy_spectral', title_fontsize='large', 
                text_fontsize='medium'); plt.show()
        
        # Add more ... ... ...
        
        return model
    
    def featureRank_Analysis(model, dataset, cols):
        print()
        print("Feature Importance/Rank Analysis: ")
        X = dataset.loc[:, cols]; X_cols = X.columns.values
    
        features_imp = model.feature_importances_    
    
        indices = np.argsort(features_imp)[::-1]
        df = {}
        for f in range(X.shape[1]):
            print("%d. feature %d %s (%f)" % (f + 1, indices[f], X_cols[indices[f]], 
                                              features_imp[indices[f]]))
            df[f] = [f + 1, indices[f], X_cols[indices[f]], features_imp[indices[f]]]

        df1 = pd.DataFrame.from_dict(df, orient = 'index')
        df1.columns = ['feature_Rank', 'feature_Index', 'feature_Name', 'feature_importance']
        df1.to_csv("FeatureImportanceRank.csv", index = False)

        # this creates a figure 5 inch wide, 3 inch high
        plt.figure(figsize=(5,3)) 
        plt.barh(df1['feature_Rank'], df1['feature_importance'], tick_label = df1['feature_Name'])
        plt.savefig('Featurefig.pdf', format='pdf')
        plt.show()   

        skplt.estimators.plot_feature_importances(model, feature_names=cols,
                                                  x_tick_rotation = 45, figsize=(5,3))
        plt.show()

        # ------------------------------------------------
        # Visualise the tree-graph (GradientBoosting)
        # ------------------------------------------------
        # install graphViz and pydotplus using pip
        # install binaries from graphViz.org and 
        # add PATH variables
        # Follow the instruction @
        # https://stackoverflow.com/questions/18438997/
        # why-is-pydot-unable-to-find-graphvizs-executables-in-windows-8
        # ------------------------------------------------
        # Get an arbitary tree number between (0,99) 
        # as "n_estimators = 100"
        
        '''
        sub_tree_number = 49 
        # plot tree from Left to Right
        xgboost.plot_tree(model, num_trees=sub_tree_number, rankdir='LR')
        fig = plt.gcf(); fig.set_size_inches(15, 10); plt.show()
        fig.savefig('treeOpenML mobileset price-1.png')
        # plot tree top to bottom
        xgboost.plot_tree(model, num_trees=sub_tree_number)
        fig = plt.gcf(); fig.set_size_inches(15, 10); plt.show()
        fig.savefig('treeOpenML mobileset price-2.png')
        ''' 
        
        # plot feature importance
        xgboost.plot_importance(model); plt.show()
        
    def save_model(model):
        with open('DSC_Recipe_7_model.pickle', 'wb') as f: 
            pk.dump(model, f)

    def final_prediction(feature_names, filename):
        # load model
        f = open('DSC_Recipe_7_model.pickle', 'rb')
        model = pk.load(f); f.close();
        
        # load dataset
        dataset = pd.read_csv(filename, sep = ',')

        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        dataset = dummyEncode(dataset)
        
        # final prediction and results
        predicted_class     = model.predict(dataset[feature_names])
        pred_proba          = model.predict_proba(dataset[feature_names])        
        dataset['predicted_class'] = predicted_class

        # Evaluate the skill of the Trained model
        acc                 = accuracy_score(dataset['price_range'], predicted_class)
        classReport         = classification_report(dataset['price_range'], predicted_class)
        confMatrix          = confusion_matrix(dataset['price_range'], predicted_class) 
        kappa_score         = cohen_kappa_score(dataset['price_range'], predicted_class)         
        
        print(); print('Testing Results of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)
        
        # ROC curves
        skplt.metrics.plot_roc(dataset['price_range'],pred_proba,figsize=(7,7)); plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(dataset['price_range'],
                                            predicted_class,figsize=(7,7)); plt.show()        

        # precision recall curve
        skplt.metrics.plot_precision_recall(dataset['price_range'], pred_proba, 
                title='Precision-Recall Curve', plot_micro=True, 
                classes_to_plot=None, ax=None, figsize=(7,7), 
                cmap='nipy_spectral', title_fontsize='large', 
                text_fontsize='medium'); plt.show()               
        
        dataset.to_csv('FinalResult.csv', index = False, 
                       columns = ['price_range', 'predicted_class'])

    def final_prediction_with_testDataset(feature_names, filename):
        # load model
        f = open('DSC_Recipe_7_model.pickle', 'rb')
        model = pk.load(f); f.close();
        
        # load dataset
        dataset = pd.read_csv(filename, sep = ',')

        print(dataset.shape);    print(dataset.head(5));    print(dataset.columns);
        print(dataset.dtypes)
        
        dataset = dummyEncode(dataset)
        
        # final prediction and results
        predicted_class     = model.predict(dataset[feature_names])
        pred_proba          = model.predict_proba(dataset[feature_names])        

        dataset['predicted_class'] = predicted_class
        dataset['predicted_proba'] = pred_proba.tolist()
        
        dataset.to_csv('FinalResultWith_testDataset.csv', index = False)
    
    if __name__ == '__main__':
        print()
        print("Execution Time %s seconds: " % (start_time))
        filename = 'mobilePriceClassification_trainDataset.csv'
        
        feature_names, target, dataset = load_dataset(filename)
        dataset = find_miising_value(feature_names, target, dataset)
        data_descriptiveStats(feature_names, target, dataset)
        data_visualization(feature_names, target, dataset)
        X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)
        model = training_model(X_train, y_train)
        model = cross_validatin_and_fitting(model, X_train, y_train)
        model = evaluate_model(model, X_test, y_test)
        featureRank_Analysis(model, dataset, feature_names)
        save_model(model) 
        
        test_filename = 'mobilePriceClassification_trainDataset.csv'
        final_prediction(feature_names, test_filename)
        
        test_filename = 'mobilePriceClassification_testDataset.csv'
        final_prediction_with_testDataset(feature_names, test_filename)
        
        print()
        print("Execution Time %s seconds: " % (time.time() - start_time))
In [8]:
DSC_Recipe_7()
*******Recipe for Data Science Competition - DSC_Recipe_7********
Classification with OpenML mobileset price dataset using xgboost and Grid Search Cross Valodation
*********************Package: scikit-learn **********************
**********************Model: XGBoost Model***********************
*************DataSet: OpenML mobileset price Dataset*************
***Model selection: using Grid Search Cross Validation (GSCV)****

Execution Time 1614830565.0740335 seconds: 
(2000, 21)
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

#---------------------------------------------------------------
Check for Mising Value or NaN Value in the Dataset
#---------------------------------------------------------------

Count Number of Missing Value on Each Column: 
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

#---------------------------------------------------------------
Check and Remove constant columns in the Dataset
#---------------------------------------------------------------

Removed `0` Constant Columns: 
[]

#---------------------------------------------------------------
Check and Remove Duplicate Columns in the Dataset
#---------------------------------------------------------------

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

Duplicate Columns in the Dataset: 
 [False False False False False False False False False False False False
 False False False False False False False False False]

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

#---------------------------------------------------------------
Check and Drop Sparse Data/Columns in the Dataset
#---------------------------------------------------------------

['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

Group Columns according to their dataTypes: 
 {int64: ['battery_power', 'blue', 'dual_sim', 'fc', 'four_g', 'int_memory', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range'], float64: ['clock_speed', 'm_dep']}

Count Number of Missing Value on Each Column: 
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

Count Number of Missing Value on Each Column: 

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
dtype: int64

0

Get Information on the feature variables: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_screen   2000 non-null   int64  
 19  wifi           2000 non-null   int64  
dtypes: float64(2), int64(18)
memory usage: 312.6 KB
None

       battery_power     blue  clock_speed  dual_sim       fc   four_g  \
count        2000.00  2000.00      2000.00   2000.00  2000.00  2000.00   
mean         1238.52     0.49         1.52      0.51     4.31     0.52   
std           439.42     0.50         0.82      0.50     4.34     0.50   
min           501.00     0.00         0.50      0.00     0.00     0.00   
25%           851.75     0.00         0.70      0.00     1.00     0.00   
50%          1226.00     0.00         1.50      1.00     3.00     1.00   
75%          1615.25     1.00         2.20      1.00     7.00     1.00   
max          1998.00     1.00         3.00      1.00    19.00     1.00   

       int_memory    m_dep  mobile_wt  n_cores       pc  px_height  px_width  \
count     2000.00  2000.00    2000.00  2000.00  2000.00    2000.00   2000.00   
mean        32.05     0.50     140.25     4.52     9.92     645.11   1251.52   
std         18.15     0.29      35.40     2.29     6.06     443.78    432.20   
min          2.00     0.10      80.00     1.00     0.00       0.00    500.00   
25%         16.00     0.20     109.00     3.00     5.00     282.75    874.75   
50%         32.00     0.50     141.00     4.00    10.00     564.00   1247.00   
75%         48.00     0.80     170.00     7.00    15.00     947.25   1633.00   
max         64.00     1.00     200.00     8.00    20.00    1960.00   1998.00   

           ram     sc_h     sc_w  talk_time  three_g  touch_screen     wifi  
count  2000.00  2000.00  2000.00    2000.00  2000.00        2000.0  2000.00  
mean   2124.21    12.31     5.77      11.01     0.76           0.5     0.51  
std    1084.73     4.21     4.36       5.46     0.43           0.5     0.50  
min     256.00     5.00     0.00       2.00     0.00           0.0     0.00  
25%    1207.50     9.00     2.00       6.00     1.00           0.0     0.00  
50%    2146.50    12.00     5.00      11.00     1.00           1.0     1.00  
75%    3064.50    16.00     9.00      16.00     1.00           1.0     1.00  
max    3998.00    19.00    18.00      20.00     1.00           1.0     1.00  

               battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02   
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03   
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04   
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02   
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00   
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02   
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02   
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03   
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02   
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02   
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01   
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03   
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03   
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02   
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02   
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02   
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03   
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03   
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02   
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02   

                 four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02   
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03   
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03   
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02   
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01   
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03   
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02   
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02   
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02   
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03   
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00   
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02   
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03   
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02   
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03   
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02   
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02   
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03   
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03   
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03   

               px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02   
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02   
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02   
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02   
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03   
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02   
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03   
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02   
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03   
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02   
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02   
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02   
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03   
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02   
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02   
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02   
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00   
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02   
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02   
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02   

                three_g  touch_screen      wifi  
battery_power  1.15e-02     -1.05e-02 -8.34e-03  
blue          -3.02e-02      1.01e-02 -2.19e-02  
clock_speed   -4.64e-02      1.98e-02 -2.45e-02  
dual_sim      -1.40e-02     -1.71e-02  2.27e-02  
fc             1.79e-03     -1.48e-02  2.01e-02  
four_g         5.84e-01      1.68e-02 -1.76e-02  
int_memory    -9.37e-03     -2.70e-02  6.99e-03  
m_dep         -1.21e-02     -2.64e-03 -2.84e-02  
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04  
n_cores       -1.47e-02      2.38e-02 -9.96e-03  
pc            -1.32e-03     -8.74e-03  5.39e-03  
px_height     -3.12e-02      2.19e-02  5.18e-02  
px_width       3.50e-04     -1.63e-03  3.03e-02  
ram            1.58e-02     -3.05e-02  2.27e-02  
sc_h           1.20e-02     -2.00e-02  2.59e-02  
sc_w           3.09e-02      1.27e-02  3.54e-02  
talk_time     -4.27e-02      1.72e-02 -2.95e-02  
three_g        1.00e+00      1.39e-02  4.32e-03  
touch_screen   1.39e-02      1.00e+00  1.19e-02  
wifi           4.32e-03      1.19e-02  1.00e+00  

Ranking of Correlation Coefficients:
                         pairs  corr
75                    (fc, pc)  0.64
96           (four_g, three_g)  0.58
154      (px_height, px_width)  0.51
175               (sc_h, sc_w)  0.51
156          (px_height, sc_h)  0.06
..                         ...   ...
2    (battery_power, dual_sim) -0.04
184       (talk_time, three_g) -0.04
39       (clock_speed, four_g) -0.04
51      (clock_speed, three_g) -0.05
95         (four_g, talk_time) -0.05

[190 rows x 2 columns]


Highly correlated variables (Absolute Correlations):

fc             pc           0.64
four_g         three_g      0.58
px_height      px_width     0.51
sc_h           sc_w         0.51
px_height      sc_h         0.06
battery_power  talk_time    0.05
px_height      wifi         0.05
               sc_w         0.04
dtype: float64

count    2000.00
mean        1.50
std         1.12
min         0.00
25%         0.75
50%         1.50
75%         2.25
max         3.00
Name: price_range, dtype: float64

price_range
0    500
1    500
2    500
3    500
dtype: int64

BOX plot of each numerical features
Histogram of each Numerical Feature
Correlation Matrix of All Numerical Features
Correlation plot of Numerical features
               battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02   
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03   
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04   
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02   
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00   
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02   
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02   
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03   
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02   
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02   
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01   
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03   
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03   
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02   
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02   
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02   
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03   
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03   
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02   
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02   

                 four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02   
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03   
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03   
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02   
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01   
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03   
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02   
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02   
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02   
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03   
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00   
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02   
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03   
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02   
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03   
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02   
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02   
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03   
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03   
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03   

               px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02   
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02   
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02   
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02   
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03   
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02   
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03   
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02   
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03   
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02   
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02   
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02   
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03   
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02   
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02   
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02   
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00   
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02   
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02   
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02   

                three_g  touch_screen      wifi  
battery_power  1.15e-02     -1.05e-02 -8.34e-03  
blue          -3.02e-02      1.01e-02 -2.19e-02  
clock_speed   -4.64e-02      1.98e-02 -2.45e-02  
dual_sim      -1.40e-02     -1.71e-02  2.27e-02  
fc             1.79e-03     -1.48e-02  2.01e-02  
four_g         5.84e-01      1.68e-02 -1.76e-02  
int_memory    -9.37e-03     -2.70e-02  6.99e-03  
m_dep         -1.21e-02     -2.64e-03 -2.84e-02  
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04  
n_cores       -1.47e-02      2.38e-02 -9.96e-03  
pc            -1.32e-03     -8.74e-03  5.39e-03  
px_height     -3.12e-02      2.19e-02  5.18e-02  
px_width       3.50e-04     -1.63e-03  3.03e-02  
ram            1.58e-02     -3.05e-02  2.27e-02  
sc_h           1.20e-02     -2.00e-02  2.59e-02  
sc_w           3.09e-02      1.27e-02  3.54e-02  
talk_time     -4.27e-02      1.72e-02 -2.95e-02  
three_g        1.00e+00      1.39e-02  4.32e-03  
touch_screen   1.39e-02      1.00e+00  1.19e-02  
wifi           4.32e-03      1.19e-02  1.00e+00  
PIE Chart of for Target: 
Fitting 10 folds for each of 81 candidates, totalling 810 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:  6.7min finished
========================================================
 Results from Grid Search 
========================================================

 The best estimator across ALL searched params:
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.05, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

 The best score across ALL searched params:
 0.9029850746268657

 The best parameters across ALL searched params:
 {'gamma': 0.2, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 500}

 ========================================================
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.6s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.8s finished
Cross Validation results:  [0.91911765 0.91111111 0.91851852 0.91851852 0.87313433 0.91791045
 0.90225564 0.87969925 0.86466165 0.92424242]
CV Mean Accuracy: 0.902917 (Std: 0.020943)

========================================================

{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1, 'gamma': 0.2, 'learning_rate': 0.05, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': None, 'n_estimators': 500, 'n_jobs': 1, 'nthread': None, 'objective': 'multi:softprob', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 1}

========================================================

Evaluation of the trained model: 

Accuracy :  0.9121212121212121

Kappa Score :  0.8825316455696203

Confusion Matrix :
 [[180  11   0   0]
 [  9 144   6   0]
 [  0   7 132   7]
 [  0   0  18 146]]

Classification Report :
              precision    recall  f1-score   support

          0       0.95      0.94      0.95       191
          1       0.89      0.91      0.90       159
          2       0.85      0.90      0.87       146
          3       0.95      0.89      0.92       164

avg / total       0.91      0.91      0.91       660

Feature Importance/Rank Analysis: 
1. feature 13 ram (0.313790)
2. feature 0 battery_power (0.192585)
3. feature 12 px_width (0.163815)
4. feature 11 px_height (0.134457)
5. feature 8 mobile_wt (0.031454)
6. feature 6 int_memory (0.023067)
7. feature 14 sc_h (0.019711)
8. feature 15 sc_w (0.018202)
9. feature 10 pc (0.017195)
10. feature 16 talk_time (0.016692)
11. feature 2 clock_speed (0.016272)
12. feature 9 n_cores (0.014763)
13. feature 4 fc (0.014427)
14. feature 7 m_dep (0.014343)
15. feature 5 four_g (0.002432)
16. feature 18 touch_screen (0.002349)
17. feature 1 blue (0.001845)
18. feature 3 dual_sim (0.001342)
19. feature 19 wifi (0.000839)
20. feature 17 three_g (0.000419)
(2000, 21)
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

Testing Results of the trained model: 

Accuracy :  0.971

Kappa Score :  0.9613333333333334

Confusion Matrix :
 [[489  11   0   0]
 [  9 485   6   0]
 [  0   7 486   7]
 [  0   0  18 482]]

Classification Report :
              precision    recall  f1-score   support

          0       0.98      0.98      0.98       500
          1       0.96      0.97      0.97       500
          2       0.95      0.97      0.96       500
          3       0.99      0.96      0.97       500

avg / total       0.97      0.97      0.97      2000

(1000, 21)
   id  battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \
0   1           1043     1          1.8         1  14       0           5   
1   2            841     1          0.5         1   4       1          61   
2   3           1807     1          2.8         0   1       0          27   
3   4           1546     0          0.5         1  18       1          25   
4   5           1434     0          1.4         0  11       1          49   

   m_dep  mobile_wt  ...  pc  px_height  px_width   ram  sc_h  sc_w  \
0    0.1        193  ...  16        226      1412  3476    12     7   
1    0.8        191  ...  12        746       857  3895     6     0   
2    0.9        186  ...   4       1270      1366  2396    17    10   
3    0.5         96  ...  20        295      1752  3893    10     0   
4    0.5        108  ...  18        749       810  1773    15     8   

   talk_time  three_g  touch_screen  wifi  
0          2        0             1     0  
1          7        1             0     0  
2         10        0             1     1  
3          7        1             1     0  
4          7        1             0     1  

[5 rows x 21 columns]
Index(['id', 'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc',
       'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc',
       'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'],
      dtype='object')
id                 int64
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
dtype: object

Execution Time 420.52383375167847 seconds: 
In [ ]: