Project 01: Decision Tree with Grid Search CV

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Data Science Coding Recipe - 001
Using Package: scikit-learn, Algorithm: Decision Tree Classifier, DataSet: OpenML mobileset price Dataset
Tuning: Parameters tuning using GridSearchCV
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
In [4]:
import warnings
warnings.filterwarnings("ignore")
In [5]:
def DSC_Recipe_1():
    """Run the decision-tree + GridSearchCV recipe on the mobile price CSV files.

    The function prints a banner, imports its dependencies locally, defines
    a set of nested helpers (loading, cleaning, descriptive statistics,
    plotting, splitting, grid search, cross-validation, evaluation, feature
    ranking, pickling and final prediction) and then runs them in sequence.

    The pipeline at the end of the body is guarded by
    ``if __name__ == '__main__'``, so it only executes when this code lives
    in the top-level script or notebook namespace.  Nothing is returned;
    results are printed, plotted and written to files in the working
    directory.
    """
    # Banner describing the recipe; each title is centred and padded with '*'.
    print()
    print(format('Recipe for Data Science Competition - DSC_Recipe_1','*^65'))
    print(format('Classification with OpenML mobileset price dataset using scikit-learn decision tree and K-fold Cross Valodation', '*^95'))    
    print(format('Package: scikit-learn ','*^65'))            
    print(format('Algorithm: Decision Tree Model','*^65'))            
    print(format('DataSet: OpenML mobileset price Dataset', '*^65'))    
    print(format('Model selection: using GridSearchCV from scikit-learn', '*^65'))    

    # load necessary libraries
    import time
    import pandas as pd
    import pickle as pk
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    import scikitplot as skplt
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.metrics import cohen_kappa_score, confusion_matrix
    from sklearn.preprocessing import LabelEncoder    
    start_time = time.time()
    
    # -------------------------------------------------------------------------
    # declare constants
    # -------------------------------------------------------------------------
    kfold = 10
    
    # -------------------------------------------------------------------------
    # Helper modules for Descriptive Statistics
    # -------------------------------------------------------------------------    
    def get_redundant_pairs(df):
        """Return the column-label pairs on or below the matrix diagonal.

        For every column position ``i`` the pairs ``(cols[i], cols[j])`` with
        ``j <= i`` are collected, i.e. the self-pairs plus one orientation of
        every unordered pair.  The result is a ``set`` of 2-tuples.
        """
        names = df.columns
        width = df.shape[1]
        return {(names[row], names[col])
                for row in range(width)
                for col in range(row + 1)}

    def get_top_abs_correlations(df, n=5):
        """Return the ``n`` largest pairwise correlations of ``df``.

        The correlation matrix is unstacked into a Series keyed by
        (column, column), the diagonal and mirrored pairs reported by
        ``get_redundant_pairs`` are removed, and the remainder is sorted in
        descending order.

        NOTE: despite the function name, values are ranked by their signed
        correlation; no absolute value is taken.
        """
        pairwise = df.corr().unstack()
        redundant = get_redundant_pairs(df)
        ranked = pairwise.drop(labels=redundant)
        ranked = ranked.sort_values(ascending=False)
        return ranked.iloc[0:n]

    def corrank(X):
        """Print every unordered column pair of ``X`` ranked by correlation.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose pairwise column correlations are ranked.

        The output is a two-column table ('pairs', 'corr') sorted from the
        highest to the lowest correlation, followed by a blank line.
        Nothing is returned.
        """
        import itertools

        # Compute the correlation matrix once; it does not change between
        # pairs, so there is no need to rebuild it for every lookup.
        corr = X.corr()
        ranked = pd.DataFrame(
            [[(i, j), corr.loc[i, j]]
             for i, j in itertools.combinations(corr, 2)],
            columns=['pairs', 'corr'])
        print(ranked.sort_values(by='corr', ascending=False))
        print()

    # Helper module for Label Encoding for Categorical Features
    def dummyEncode(df):
        """Label-encode every 'category'/'object' column of ``df`` in place.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to encode.  Its categorical/object columns are overwritten
            with the integer codes produced by ``LabelEncoder``.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object that was passed in.

        Encoding is best-effort: a column that cannot be encoded is reported
        on stdout (with the reason) and left unchanged.
        """
        columnsToEncode = list(df.select_dtypes(include=['category', 'object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                df[feature] = le.fit_transform(df[feature])
            except Exception as exc:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt/SystemExit still propagate; ``str()``
                # keeps the report working for non-string column labels.
                print('Error encoding ' + str(feature) + ': ' + str(exc))
        return df

    # -------------------------------------------------------------------------    
    # load dataset
    # ------------------------------------------------------------------------- 
    def load_dataset(filename):
        """Read the training CSV and return its features, target and data.

        Parameters
        ----------
        filename : str
            Path of a comma-separated file containing the 20 feature columns
            listed below plus the 'price_range' column.

        Returns
        -------
        (feature_names, target, dataset)
            ``feature_names`` is the list of feature column labels,
            ``target`` is ``'price_range'`` and ``dataset`` is a DataFrame
            restricted to those columns after ``dummyEncode``.

        The shape, first five rows, column index and dtypes of the raw file
        are printed before the columns are selected.
        """
        dataset = pd.read_csv(filename, sep=',')

        print(dataset.shape)
        print(dataset.head(5))
        print(dataset.columns)
        print(dataset.dtypes)

        feature_names = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
                         'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
                         'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
                         'touch_screen', 'wifi']

        target = 'price_range'

        # Reuse the single column list instead of repeating it, and take an
        # explicit copy because dummyEncode assigns into the frame it receives.
        selected = dataset[feature_names + [target]].copy()
        dataset = dummyEncode(selected)

        return feature_names, target, dataset

    # -------------------------------------------------------------------------    
    # find missing values in dataset if exists
    # -------------------------------------------------------------------------
    def find_miising_value(feature_names, target, dataset):
        """Report missing values, drop uninformative columns and fill NaNs.

        Steps, each announced on stdout:
          1. count the missing values per column;
          2. drop non-target columns whose standard deviation is 0;
          3. drop columns whose label duplicates an earlier column label;
          4. drop non-target columns with fewer than two unique values;
          5. fill NaNs by dtype: 0 for int64, 0.0 for float64 and
             'Unknown' for object columns.

        Parameters
        ----------
        feature_names : list
            Not used by this function; kept so existing calls keep working.
        target : str
            Label of the target column; it is exempt from steps 2 and 4.
        dataset : pandas.DataFrame
            Input frame.  It is not modified; a cleaned frame is returned.

        Returns
        -------
        pandas.DataFrame
            The cleaned frame.
        """
        rule = '#---------------------------------------------------------------'

        def _section(title):
            # Print a blank line followed by a ruled section heading.
            print()
            print(rule)
            print(title)
            print(rule)

        def _null_columns(frame):
            # Labels of the columns that contain at least one missing value.
            return frame.columns[frame.isnull().sum() != 0]

        _section('Check for Mising Value or NaN Value in the Dataset')
        print('\nCount Number of Missing Value on Each Column: ')
        print(dataset.isnull().sum(axis=0))

        null_cols = _null_columns(dataset)
        feature_count = null_cols.size
        print()
        print("Total Features with missing Values = " + str(feature_count))

        if feature_count:
            print()
            print("Features with NaN => {}".format(list(null_cols)))
            print('Count Number of Missing Value on Each Column: ')
            print(dataset[null_cols].isnull().sum().sort_values(ascending=False))

        _section('Check and Remove constant columns in the Dataset')
        # Use the ``target`` argument rather than a hard-coded column name.
        colsToRemove = [col for col in dataset.columns
                        if col != target and dataset[col].std() == 0]
        print()
        print("Removed `{}` Constant Columns: ".format(len(colsToRemove)))
        print(colsToRemove)
        # Non in-place drop: the caller's frame is left untouched.
        dataset = dataset.drop(columns=colsToRemove)

        _section('Check and Remove Duplicate Columns in the Dataset')
        print()
        print(dataset.columns)
        print(dataset.head(5))
        duplicated = dataset.columns.duplicated()
        print('\nDuplicate Columns in the Dataset: \n', duplicated)
        dataset = dataset.loc[:, ~duplicated].copy()
        print()
        print(dataset.columns)
        print(dataset.head(5))

        _section('Check and Drop Sparse Data/Columns in the Dataset')
        flist = [x for x in dataset.columns if x != target]
        print()
        print(flist)
        for f in flist:
            if len(np.unique(dataset[f])) < 2:
                print('Feature contains Sparse Data: ', f)
                dataset = dataset.drop(columns=f)
        print()
        print(dataset.columns)
        print(dataset.head(5))

        # Missing-value treatment: fill according to each column's dtype.
        gd = dataset.columns.to_series().groupby(dataset.dtypes).groups
        print('\nGroup Columns according to their dataTypes: \n', gd)
        for colName in dataset.columns.values.tolist():
            if dataset[colName].dtypes == 'int64':
                dataset[colName] = dataset[colName].fillna(0)
            if dataset[colName].dtypes == 'float64':
                dataset[colName] = dataset[colName].fillna(0.0)
            if dataset[colName].dtypes == 'object':
                dataset[colName] = dataset[colName].fillna('Unknown')

        print('\nCount Number of Missing Value on Each Column: ')
        print(dataset.isnull().sum(axis=0))

        feature_count = _null_columns(dataset).size
        print()
        print("Total Features with missing Values = " + str(feature_count))

        return dataset

    # -------------------------------------------------------------------------
    # descriptive statistics and correlation matrix
    # -------------------------------------------------------------------------    
    def data_descriptiveStats(feature_names, target, dataset):
        """Print descriptive statistics and correlation rankings.

        Parameters
        ----------
        feature_names : list
            Labels of the feature columns to summarise.
        target : str
            Label of the target column.
        dataset : pandas.DataFrame
            Frame containing the feature and target columns.

        Prints, in order: missing-value counts, ``info()`` and ``describe()``
        of the features, the feature correlation matrix, the ranked pairwise
        correlations (``corrank``), the top 75 pairs from
        ``get_top_abs_correlations``, and the target's description and class
        sizes.  Nothing is returned.
        """
        features = dataset[feature_names]

        # Count Number of Missing Value on Each Column
        print(); print('Count Number of Missing Value on Each Column: ')
        print(); print(features.isnull().sum(axis=0))
        print(); print(dataset[target].isnull().sum(axis=0))

        # Get Information on the feature variables
        print(); print('Get Information on the feature variables: ')
        # info() writes to stdout itself and returns None, so it is called
        # directly rather than wrapped in print() (which emitted "None").
        print(); features.info()
        print(); print(features.describe())

        # correlation
        # The fully-qualified option name is required: the short 'precision'
        # pattern is ambiguous/rejected by current pandas releases.
        pd.set_option('display.precision', 2)
        print(); print(features.corr())

        # Ranking of Correlation Coefficients among Variable Pairs
        print(); print("Ranking of Correlation Coefficients:")
        corrank(features)

        # Print Highly Correlated Variables
        print(); print("Highly correlated variables (Absolute Correlations):")
        print(); print(get_top_abs_correlations(features, 75))

        # Get Information on the target
        print(); print(dataset[target].describe())
        print(); print(dataset.groupby(target).size())

    # -------------------------------------------------------------------------
    # data visualisation and correlation graph
    # -------------------------------------------------------------------------
    def data_visualization(feature_names, target, dataset):
        """Draw the exploratory plots for the features and the target.

        Parameters
        ----------
        feature_names : list
            Labels of the feature columns to plot.
        target : str
            Label of the target column (pair-plot hue and pie chart).
        dataset : pandas.DataFrame
            Frame containing the feature and target columns.

        Shows, in order: a grid of box plots, a grid of histograms, a
        matshow correlation matrix, a seaborn pair plot, a masked seaborn
        correlation heatmap and a pie chart of the target classes (also
        saved as 'Piefig.pdf').  Nothing is returned.
        """
        n_features = len(feature_names)
        # Four plots per row; at least the historical 5x4 grid, growing with
        # the number of features so the subplot index never overflows.
        grid_cols = 4
        grid_rows = max(5, -(-n_features // grid_cols))

        # BOX plots USING box and whisker plots
        print(); print('BOX plot of each numerical features')
        plt.figure(figsize=(11,9))
        for position, col in enumerate(feature_names, start=1):
            plt.subplot(grid_rows, grid_cols, position)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=True,
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
        plt.show()

        # USING histograms
        print(); print('Histogram of each Numerical Feature')
        plt.figure(figsize=(11,9))
        for position, col in enumerate(feature_names, start=1):
            plt.subplot(grid_rows, grid_cols, position)
            plt.axis('on')
            plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False,
                            labelleft=False, labeltop=False, labelright=False, labelbottom=False)
            dataset[col].hist()
        plt.show()

        # correlation matrix
        print(); print('Correlation Matrix of All Numerical Features')
        fig = plt.figure(figsize=(11,9))
        ax = fig.add_subplot(111)
        cax = ax.matshow(dataset[feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
        fig.colorbar(cax)
        # One tick per feature instead of a fixed 20.
        ticks = np.arange(0, n_features, 1)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        plt.show()

        # Seaborn pairplot
        sns.pairplot(dataset, hue = target)
        plt.show()

        # Correlation Plot using seaborn
        print(); print("Correlation plot of Numerical features")
        # Compute the correlation matrix
        corr = dataset[feature_names].corr()
        print(corr)
        # Generate a mask for the upper triangle.  The builtin ``bool`` is
        # used because the ``np.bool`` alias no longer exists in NumPy.
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        # Set up the matplotlib figure
        f, ax = plt.subplots(figsize=(11, 9))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, vmin= -1.0, center=0, square=True,
                    linewidths=.5, cbar_kws={"shrink": .5})
        plt.show()

        # Pie chart for Categorical Variables
        print(); print('PIE Chart of for Target: ')
        plt.figure(figsize=(11,9))
        i = 1
        for colName in [target]:
            counts = dataset.groupby(colName).size()
            labels = list(counts.keys())
            sizes = [counts[key] for key in labels]
            # Plot PIE Chart with %
            plt.subplot(2,2,i)
            plt.axis('on')
            plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False,
                            labelleft=True, labeltop=True, labelright=False, labelbottom=False)
            plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
            plt.axis('equal')
            i += 1
            plt.savefig('Piefig.pdf', format='pdf')
        plt.show()

    # -------------------------------------------------------------------------
    # data split to train and test datasets
    # -------------------------------------------------------------------------    
    def data_split(feature_names, target, dataset, test_size=0.05, random_state=None):
        """Split ``dataset`` into train and test partitions.

        Parameters
        ----------
        feature_names : list
            Labels of the feature columns (the X part).
        target : str
            Label of the target column (the y part).
        dataset : pandas.DataFrame
            Frame containing the feature and target columns.
        test_size : float, optional
            Forwarded to ``train_test_split``; defaults to the previously
            hard-coded 0.05.
        random_state : optional
            Forwarded to ``train_test_split``; the default ``None`` keeps
            the previous unseeded behaviour, pass a value for a
            reproducible split.

        Returns
        -------
        (X_train, X_test, y_train, y_test)
        """
        X = dataset.loc[:, feature_names]
        y = dataset.loc[:, target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

        return X_train, X_test, y_train, y_test

    def training_model(X_train, y_train):
        """Grid-search a DecisionTreeClassifier and return the best estimator.

        The search runs ``kfold``-fold cross-validation over the grid below
        on all available cores and refits the winning configuration on the
        full training data.  The best estimator, score and parameters are
        printed before the refitted estimator is returned.
        """
        # Candidate values for each tuned parameter.
        search_space = {
            'max_depth': [4, 6, 8],
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3],
            'random_state': [7, 23, 42, 78, 142],
            'min_impurity_decrease': [0.0, 0.05, 0.1, 0.2],
            # Add more parameters here for tuning
        }
        search = GridSearchCV(
            estimator=DecisionTreeClassifier(),
            param_grid=search_space,
            cv=kfold,
            verbose=1,
            n_jobs=-1,
            refit=True,
        )
        search.fit(X_train, y_train)

        # Results from Grid Search
        winner = search.best_estimator_
        print("\n========================================================")
        print(" Results from Grid Search ")
        print("========================================================")
        print("\n The best estimator across ALL searched params:\n", winner)
        print("\n The best score across ALL searched params:\n", search.best_score_)
        print("\n The best parameters across ALL searched params:\n", search.best_params_)
        print("\n ========================================================")

        return winner

    def cross_validatin_and_fitting(model, X_train, y_train):
        """Cross-validate ``model``, fit it on the training data and return it.

        Accuracy is scored with ``kfold``-fold cross-validation and printed
        together with its mean and standard deviation.  The model is then
        fitted on the whole training set, its parameters are printed and a
        learning curve is plotted.
        """
        fold_scores = cross_val_score(
            model, X_train, y_train,
            cv=kfold, scoring='accuracy', n_jobs=-1, verbose=1)

        # Cross Validation Results
        print()
        print("Cross Validation results: ", fold_scores)
        print("CV Mean Accuracy: %f (Std: %f)" % (fold_scores.mean(), fold_scores.std()))

        # Final fitting of the Model
        model.fit(X_train, y_train)

        separator = '========================================================'
        print()
        print(separator)
        print()
        print(model.get_params(deep=True))
        print()
        print(separator)

        # plot learning Curves
        skplt.estimators.plot_learning_curve(model, X_train, y_train, figsize=(6, 6))
        plt.show()

        return model

    def evaluate_model(model, X_test, y_test):
        """Score ``model`` on the held-out data, print the metrics, return it.

        Accuracy, Cohen's kappa, the confusion matrix and the classification
        report are printed, then the confusion matrix is plotted.  The model
        is returned unchanged.
        """
        predictions = model.predict(X_test)

        # Metrics of the trained model on the test partition.
        accuracy = accuracy_score(y_test, predictions)
        report = classification_report(y_test, predictions)
        matrix = confusion_matrix(y_test, predictions)
        kappa = cohen_kappa_score(y_test, predictions)

        print()
        print('Evaluation of the trained model: ')
        for caption, value in (('Accuracy : ', accuracy),
                               ('Kappa Score : ', kappa),
                               ('Confusion Matrix :\n', matrix),
                               ('Classification Report :\n', report)):
            print()
            print(caption, value)

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_test, predictions, figsize=(7, 7))
        plt.show()

        return model
    
    def featureRank_Analysis(model, dataset, cols):
        """Rank the features by ``model.feature_importances_`` and plot them.

        The ranking is printed, written to 'FeatureImportanceRank.csv',
        drawn as a horizontal bar chart (saved as 'Featurefig.pdf') and
        drawn again with scikit-plot.  Nothing is returned.
        """
        print()
        print("Feature Importance/Rank Analysis: ")
        X = dataset.loc[:, cols]
        X_cols = X.columns.values

        features_imp = model.feature_importances_

        # Feature positions ordered from most to least important.
        indices = np.argsort(features_imp)[::-1]
        ranking = {}
        for position in range(X.shape[1]):
            idx = indices[position]
            rank = position + 1
            name = X_cols[idx]
            weight = features_imp[idx]
            print("%d. feature %d %s (%f)" % (rank, idx, name, weight))
            ranking[position] = [rank, idx, name, weight]

        table = pd.DataFrame.from_dict(ranking, orient='index')
        table.columns = ['feature_Rank', 'feature_Index', 'feature_Name', 'feature_importance']
        table.to_csv("FeatureImportanceRank.csv", index=False)

        # this creates a figure 5 inch wide, 3 inch high
        plt.figure(figsize=(5, 3))
        plt.barh(table['feature_Rank'], table['feature_importance'],
                 tick_label=table['feature_Name'])
        plt.savefig('Featurefig.pdf', format='pdf')
        plt.show()

        skplt.estimators.plot_feature_importances(model, feature_names=cols,
                                                  x_tick_rotation=45, figsize=(5, 3))
        plt.show()

        # The fitted tree can additionally be exported for visual inspection
        # with sklearn.tree.export_graphviz and rendered via pydotplus; that
        # requires the Graphviz binaries to be installed and on PATH.  The
        # export is intentionally not performed here.
        
    def save_model(model):
        """Pickle ``model`` to 'DSC_Recipe_1_model.pickle' in the working directory."""
        destination = 'DSC_Recipe_1_model.pickle'
        with open(destination, 'wb') as handle:
            pk.dump(model, handle)

    def final_prediction(feature_names, filename):
        """Score the pickled model against a labelled CSV file.

        Parameters
        ----------
        feature_names : list
            Labels of the columns passed to the model.
        filename : str
            Comma-separated file that contains those columns and a
            'price_range' column holding the true classes.

        Prints accuracy, Cohen's kappa, the confusion matrix and the
        classification report, plots ROC, confusion-matrix and
        precision-recall figures, and writes the true and predicted classes
        to 'FinalResult.csv'.  Nothing is returned.
        """
        # load model
        # NOTE: unpickling executes code from the file; only load a pickle
        # that this recipe itself produced.
        # ``with`` guarantees the handle is closed even if loading fails.
        with open('DSC_Recipe_1_model.pickle', 'rb') as f:
            model = pk.load(f)

        # load dataset
        dataset = pd.read_csv(filename, sep=',')

        print(dataset.shape)
        print(dataset.head(5))
        print(dataset.columns)
        print(dataset.dtypes)

        dataset = dummyEncode(dataset)

        # final prediction and results
        predicted_class = model.predict(dataset[feature_names])
        pred_proba = model.predict_proba(dataset[feature_names])
        dataset['predicted_class'] = predicted_class

        # Evaluate the skill of the Trained model
        actual_class = dataset['price_range']
        acc = accuracy_score(actual_class, predicted_class)
        classReport = classification_report(actual_class, predicted_class)
        confMatrix = confusion_matrix(actual_class, predicted_class)
        kappa_score = cohen_kappa_score(actual_class, predicted_class)

        print(); print('Testing Results of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Kappa Score : ', kappa_score)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)

        # ROC curves
        skplt.metrics.plot_roc(actual_class, pred_proba, figsize=(7,7))
        plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(actual_class, predicted_class, figsize=(7,7))
        plt.show()

        # precision recall curve
        skplt.metrics.plot_precision_recall(actual_class, pred_proba,
                title='Precision-Recall Curve', plot_micro=True,
                classes_to_plot=None, ax=None, figsize=(7,7),
                cmap='nipy_spectral', title_fontsize='large',
                text_fontsize='medium')
        plt.show()

        dataset.to_csv('FinalResult.csv', index = False,
                       columns = ['price_range', 'predicted_class'])


    def final_prediction_with_testDataset(feature_names, filename):
        """Predict classes for a CSV file and save the results.

        Parameters
        ----------
        feature_names : list
            Labels of the columns passed to the model.
        filename : str
            Comma-separated file that contains those columns.

        The file's contents, extended with 'predicted_class' and
        'predicted_proba' columns, are written to
        'FinalResultWith_testDataset.csv'.  Nothing is returned.
        """
        # load model
        # NOTE: unpickling executes code from the file; only load a pickle
        # that this recipe itself produced.
        # ``with`` guarantees the handle is closed even if loading fails.
        with open('DSC_Recipe_1_model.pickle', 'rb') as f:
            model = pk.load(f)

        # load dataset
        dataset = pd.read_csv(filename, sep=',')

        print(dataset.shape)
        print(dataset.head(5))
        print(dataset.columns)
        print(dataset.dtypes)

        dataset = dummyEncode(dataset)

        # final prediction and results
        features = dataset[feature_names]
        predicted_class = model.predict(features)
        pred_proba = model.predict_proba(features)

        dataset['predicted_class'] = predicted_class
        # One list of per-class probabilities per row.
        dataset['predicted_proba'] = pred_proba.tolist()

        dataset.to_csv('FinalResultWith_testDataset.csv', index = False)
    
    # -------------------------------------------------------------------------
    # Driver: runs the whole recipe.  The guard means the pipeline only runs
    # when this code lives in the top-level script/notebook namespace.
    # -------------------------------------------------------------------------
    if __name__ == '__main__':
        print()
        # start_time is an epoch timestamp, not a duration, so report it as
        # the start time; the elapsed time is printed at the end.
        print("Start Time: %s" % time.ctime(start_time))
        filename = 'mobilePriceClassification_trainDataset.csv'

        feature_names, target, dataset = load_dataset(filename)
        dataset = find_miising_value(feature_names, target, dataset)
        # Cleaning may have dropped columns; keep only the features that are
        # still present so the later column selections do not fail.
        feature_names = [name for name in feature_names if name in dataset.columns]
        data_descriptiveStats(feature_names, target, dataset)
        data_visualization(feature_names, target, dataset)
        X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)
        model = training_model(X_train, y_train)
        model = cross_validatin_and_fitting(model, X_train, y_train)
        model = evaluate_model(model, X_test, y_test)
        featureRank_Analysis(model, dataset, feature_names)
        save_model(model)

        # Re-score the saved model on the labelled training file.
        test_filename = 'mobilePriceClassification_trainDataset.csv'
        final_prediction(feature_names, test_filename)

        # Predict the separate test file.
        test_filename = 'mobilePriceClassification_testDataset.csv'
        final_prediction_with_testDataset(feature_names, test_filename)

        print()
        print("Execution Time %s seconds: " % (time.time() - start_time))
In [ ]:
DSC_Recipe_1()
*******Recipe for Data Science Competition - DSC_Recipe_1********
Classification with OpenML mobileset price dataset using scikit-learn decision tree and K-fold Cross Valodation
*********************Package: scikit-learn **********************
*****************Algorithm: Decision Tree Model******************
*************DataSet: OpenML mobileset price Dataset*************
******Model selection: using GridSearchCV from scikit-learn******

Execution Time 1614655386.460959 seconds: 
(2000, 21)
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

#---------------------------------------------------------------
Check for Mising Value or NaN Value in the Dataset
#---------------------------------------------------------------

Count Number of Missing Value on Each Column: 
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

#---------------------------------------------------------------
Check and Remove constant columns in the Dataset
#---------------------------------------------------------------

Removed `0` Constant Columns: 
[]

#---------------------------------------------------------------
Check and Remove Duplicate Columns in the Dataset
#---------------------------------------------------------------

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

Duplicate Columns in the Dataset: 
 [False False False False False False False False False False False False
 False False False False False False False False False]

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

#---------------------------------------------------------------
Check and Drop Sparse Data/Columns in the Dataset
#---------------------------------------------------------------

['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]

Group Columns according to their dataTypes: 
 {int64: ['battery_power', 'blue', 'dual_sim', 'fc', 'four_g', 'int_memory', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range'], float64: ['clock_speed', 'm_dep']}

Count Number of Missing Value on Each Column: 
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

Count Number of Missing Value on Each Column: 

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
dtype: int64

0

Get Information on the feature variables: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_screen   2000 non-null   int64  
 19  wifi           2000 non-null   int64  
dtypes: float64(2), int64(18)
memory usage: 312.6 KB
None

       battery_power     blue  clock_speed  dual_sim       fc   four_g  \
count        2000.00  2000.00      2000.00   2000.00  2000.00  2000.00   
mean         1238.52     0.49         1.52      0.51     4.31     0.52   
std           439.42     0.50         0.82      0.50     4.34     0.50   
min           501.00     0.00         0.50      0.00     0.00     0.00   
25%           851.75     0.00         0.70      0.00     1.00     0.00   
50%          1226.00     0.00         1.50      1.00     3.00     1.00   
75%          1615.25     1.00         2.20      1.00     7.00     1.00   
max          1998.00     1.00         3.00      1.00    19.00     1.00   

       int_memory    m_dep  mobile_wt  n_cores       pc  px_height  px_width  \
count     2000.00  2000.00    2000.00  2000.00  2000.00    2000.00   2000.00   
mean        32.05     0.50     140.25     4.52     9.92     645.11   1251.52   
std         18.15     0.29      35.40     2.29     6.06     443.78    432.20   
min          2.00     0.10      80.00     1.00     0.00       0.00    500.00   
25%         16.00     0.20     109.00     3.00     5.00     282.75    874.75   
50%         32.00     0.50     141.00     4.00    10.00     564.00   1247.00   
75%         48.00     0.80     170.00     7.00    15.00     947.25   1633.00   
max         64.00     1.00     200.00     8.00    20.00    1960.00   1998.00   

           ram     sc_h     sc_w  talk_time  three_g  touch_screen     wifi  
count  2000.00  2000.00  2000.00    2000.00  2000.00        2000.0  2000.00  
mean   2124.21    12.31     5.77      11.01     0.76           0.5     0.51  
std    1084.73     4.21     4.36       5.46     0.43           0.5     0.50  
min     256.00     5.00     0.00       2.00     0.00           0.0     0.00  
25%    1207.50     9.00     2.00       6.00     1.00           0.0     0.00  
50%    2146.50    12.00     5.00      11.00     1.00           1.0     1.00  
75%    3064.50    16.00     9.00      16.00     1.00           1.0     1.00  
max    3998.00    19.00    18.00      20.00     1.00           1.0     1.00  

               battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02   
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03   
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04   
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02   
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00   
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02   
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02   
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03   
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02   
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02   
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01   
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03   
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03   
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02   
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02   
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02   
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03   
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03   
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02   
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02   

                 four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02   
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03   
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03   
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02   
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01   
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03   
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02   
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02   
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02   
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03   
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00   
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02   
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03   
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02   
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03   
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02   
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02   
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03   
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03   
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03   

               px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02   
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02   
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02   
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02   
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03   
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02   
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03   
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02   
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03   
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02   
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02   
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02   
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03   
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02   
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02   
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02   
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00   
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02   
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02   
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02   

                three_g  touch_screen      wifi  
battery_power  1.15e-02     -1.05e-02 -8.34e-03  
blue          -3.02e-02      1.01e-02 -2.19e-02  
clock_speed   -4.64e-02      1.98e-02 -2.45e-02  
dual_sim      -1.40e-02     -1.71e-02  2.27e-02  
fc             1.79e-03     -1.48e-02  2.01e-02  
four_g         5.84e-01      1.68e-02 -1.76e-02  
int_memory    -9.37e-03     -2.70e-02  6.99e-03  
m_dep         -1.21e-02     -2.64e-03 -2.84e-02  
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04  
n_cores       -1.47e-02      2.38e-02 -9.96e-03  
pc            -1.32e-03     -8.74e-03  5.39e-03  
px_height     -3.12e-02      2.19e-02  5.18e-02  
px_width       3.50e-04     -1.63e-03  3.03e-02  
ram            1.58e-02     -3.05e-02  2.27e-02  
sc_h           1.20e-02     -2.00e-02  2.59e-02  
sc_w           3.09e-02      1.27e-02  3.54e-02  
talk_time     -4.27e-02      1.72e-02 -2.95e-02  
three_g        1.00e+00      1.39e-02  4.32e-03  
touch_screen   1.39e-02      1.00e+00  1.19e-02  
wifi           4.32e-03      1.19e-02  1.00e+00  

Ranking of Correlation Coefficients:
                         pairs  corr
75                    (fc, pc)  0.64
96           (four_g, three_g)  0.58
154      (px_height, px_width)  0.51
175               (sc_h, sc_w)  0.51
156          (px_height, sc_h)  0.06
..                         ...   ...
2    (battery_power, dual_sim) -0.04
184       (talk_time, three_g) -0.04
39       (clock_speed, four_g) -0.04
51      (clock_speed, three_g) -0.05
95         (four_g, talk_time) -0.05

[190 rows x 2 columns]


Highly correlated variables (Absolute Correlations):

fc           pc            6.45e-01
four_g       three_g       5.84e-01
px_height    px_width      5.11e-01
sc_h         sc_w          5.06e-01
px_height    sc_h          5.96e-02
                             ...   
four_g       ram           7.31e-03
int_memory   wifi          6.99e-03
             m_dep         6.89e-03
px_width     talk_time     6.72e-03
clock_speed  int_memory    6.55e-03
Length: 75, dtype: float64

count    2000.00
mean        1.50
std         1.12
min         0.00
25%         0.75
50%         1.50
75%         2.25
max         3.00
Name: price_range, dtype: float64

price_range
0    500
1    500
2    500
3    500
dtype: int64

BOX plot of each numerical features
Histogram of each Numerical Feature
Correlation Matrix of All Numerical Features
Correlation plot of Numerical features
               battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02   
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03   
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04   
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02   
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00   
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02   
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02   
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03   
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02   
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02   
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01   
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03   
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03   
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02   
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02   
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02   
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03   
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03   
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02   
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02   

                 four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02   
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03   
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03   
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02   
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01   
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03   
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02   
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02   
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02   
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03   
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00   
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02   
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03   
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02   
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03   
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02   
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02   
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03   
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03   
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03   

               px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02   
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02   
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02   
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02   
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03   
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02   
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03   
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02   
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03   
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02   
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02   
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02   
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03   
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02   
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02   
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02   
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00   
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02   
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02   
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02   

                three_g  touch_screen      wifi  
battery_power  1.15e-02     -1.05e-02 -8.34e-03  
blue          -3.02e-02      1.01e-02 -2.19e-02  
clock_speed   -4.64e-02      1.98e-02 -2.45e-02  
dual_sim      -1.40e-02     -1.71e-02  2.27e-02  
fc             1.79e-03     -1.48e-02  2.01e-02  
four_g         5.84e-01      1.68e-02 -1.76e-02  
int_memory    -9.37e-03     -2.70e-02  6.99e-03  
m_dep         -1.21e-02     -2.64e-03 -2.84e-02  
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04  
n_cores       -1.47e-02      2.38e-02 -9.96e-03  
pc            -1.32e-03     -8.74e-03  5.39e-03  
px_height     -3.12e-02      2.19e-02  5.18e-02  
px_width       3.50e-04     -1.63e-03  3.03e-02  
ram            1.58e-02     -3.05e-02  2.27e-02  
sc_h           1.20e-02     -2.00e-02  2.59e-02  
sc_w           3.09e-02      1.27e-02  3.54e-02  
talk_time     -4.27e-02      1.72e-02 -2.95e-02  
three_g        1.00e+00      1.39e-02  4.32e-03  
touch_screen   1.39e-02      1.00e+00  1.19e-02  
wifi           4.32e-03      1.19e-02  1.00e+00  
PIE Chart of for Target: 
Fitting 10 folds for each of 960 candidates, totalling 9600 fits
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:    3.8s
========================================================
 Results from Grid Search 
========================================================

 The best estimator across ALL searched params:
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=23,
            splitter='best')

 The best score across ALL searched params:
 0.8463157894736842

 The best parameters across ALL searched params:
 {'criterion': 'entropy', 'max_depth': 8, 'min_impurity_decrease': 0.0, 'min_weight_fraction_leaf': 0.0, 'random_state': 23, 'splitter': 'best'}

 ========================================================
[Parallel(n_jobs=-1)]: Done 9600 out of 9600 | elapsed:    6.6s finished
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
Cross Validation results:  [0.81675393 0.84293194 0.82722513 0.85863874 0.85340314 0.84210526
 0.84736842 0.85185185 0.87765957 0.84574468]
CV Mean Accuracy: 0.846368 (Std: 0.015761)

========================================================

{'class_weight': None, 'criterion': 'entropy', 'max_depth': 8, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': 23, 'splitter': 'best'}

========================================================
Evaluation of the trained model: 

Accuracy :  0.87

Kappa Score :  0.8258773104741495

Confusion Matrix :
 [[19  1  0  0]
 [ 4 27  1  0]
 [ 0  1 19  5]
 [ 0  0  1 22]]

Classification Report :
              precision    recall  f1-score   support

          0       0.83      0.95      0.88        20
          1       0.93      0.84      0.89        32
          2       0.90      0.76      0.83        25
          3       0.81      0.96      0.88        23

avg / total       0.88      0.87      0.87       100

Feature Importance/Rank Analysis: 
1. feature 13 ram (0.676472)
2. feature 0 battery_power (0.119032)
3. feature 12 px_width (0.087911)
4. feature 11 px_height (0.061858)
5. feature 8 mobile_wt (0.013995)
6. feature 10 pc (0.008317)
7. feature 16 talk_time (0.006277)
8. feature 6 int_memory (0.005579)
9. feature 7 m_dep (0.005171)
10. feature 4 fc (0.003699)
11. feature 15 sc_w (0.003243)
12. feature 2 clock_speed (0.002213)
13. feature 9 n_cores (0.001810)
14. feature 5 four_g (0.001185)
15. feature 3 dual_sim (0.001079)
16. feature 19 wifi (0.000876)
17. feature 17 three_g (0.000743)
18. feature 1 blue (0.000540)
19. feature 18 touch_screen (0.000000)
20. feature 14 sc_h (0.000000)
(2000, 21)
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

Testing Results of the trained model: 

Accuracy :  0.979

Kappa Score :  0.972

Confusion Matrix :
 [[498   2   0   0]
 [  9 490   1   0]
 [  0  17 476   7]
 [  0   0   6 494]]

Classification Report :
              precision    recall  f1-score   support

          0       0.98      1.00      0.99       500
          1       0.96      0.98      0.97       500
          2       0.99      0.95      0.97       500
          3       0.99      0.99      0.99       500

avg / total       0.98      0.98      0.98      2000