Project 08: XGBoost Algorithm with MCCV¶

--------------------------------------------------------------------------------------------¶

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
def DSC_Recipe_8():
print()
print(format('Recipe for Data Science Competition - DSC_Recipe_8','*^65'))
print(format('Classification with OpenML mobileset price dataset using XGBoost & Monte Carlo Cross Validation', '*^95'))
print(format('Package: scikit-learn ','*^65'))
print(format('Model: XGBoost Model','*^65'))
print(format('DataSet: OpenML mobileset price Dataset', '*^65'))
print(format('Model selection: using Monte Carlo Cross Validation (MCCV) / Repeated Random Sub-Sampling Validation (RRSSV)', '*^95'))

import time
import pandas as pd
import pickle as pk
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import warnings

start_time = time.time()

# -------------------------------------------------------------------------
# Helper modules for Descriptive Statistics
# -------------------------------------------------------------------------
def get_redundant_pairs(df):
pairs_to_drop = set()
cols = df.columns
for i in range(0, df.shape[1]):
for j in range(0, i+1):
return pairs_to_drop

def get_top_abs_correlations(df, n=5):
#au_corr = df.corr().abs().unstack()
au_corr = df.corr().unstack()
labels_to_drop = get_redundant_pairs(df)
au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
return au_corr[0:n]

def corrank(X):
import itertools
df = pd.DataFrame([[(i,j),
X.corr().loc[i,j]] for i,j in list(itertools.combinations(X.corr(), 2))],
columns=['pairs','corr'])
print(df.sort_values(by='corr',ascending=False))
print()

# Helper module for Label Encoding for Categorical Features
def dummyEncode(df):
columnsToEncode = list(df.select_dtypes(include=['category',
'object']))
le = LabelEncoder()
for feature in columnsToEncode:
try:
df[feature] = le.fit_transform(df[feature])
except:
print('Error encoding '+feature)
return df

# -------------------------------------------------------------------------
# -------------------------------------------------------------------------

dataset = pd.read_csv(filename, sep = ',')

print(dataset.dtypes)

feature_names = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']

target = 'price_range'

dataset = dummyEncode(dataset[['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi', 'price_range']])

return feature_names, target, dataset

# -------------------------------------------------------------------------
# find missing values in dataset if exists
# -------------------------------------------------------------------------
def find_miising_value(feature_names, target, dataset):

print()
print('#---------------------------------------------------------------')
print('Check for Mising Value or NaN Value in the Dataset')
print('#---------------------------------------------------------------')
# Method - 1
# Count Number of Missing Value on Each Column
print('\nCount Number of Missing Value on Each Column: ')
print(dataset.isnull().sum(axis=0))

# Count Number of Missing Value on Each Row
#print('\nCount Number of Missing Value on Each Row: ')
#print(dataset.isnull().sum(axis=1))

# Method - 2
# Check if there are any missing values in Dataset
feature_count = dataset.columns[dataset.isnull().sum() != 0].size
print()
print("Total Features with missing Values = " + str(feature_count))

if (feature_count):
print()
print("Features with NaN => {}".format(list(dataset.columns[dataset.isnull().sum() != 0])))
print('Count Number of Missing Value on Each Column: ')
print(dataset[dataset.columns[dataset.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False))

print()
print('#---------------------------------------------------------------')
print('Check and Remove constant columns in the Dataset')
print('#---------------------------------------------------------------')
colsToRemove = []
for col in dataset.columns:
if col not in ['price_range']:
if dataset[col].std() == 0:
colsToRemove.append(col)
print()
print("Removed {} Constant Columns: ".format(len(colsToRemove)))
print(colsToRemove)
# remove constant columns in the Dataset
dataset.drop(colsToRemove, axis=1, inplace=True)

print()
print('#---------------------------------------------------------------')
print('Check and Remove Duplicate Columns in the Dataset')
print('#---------------------------------------------------------------')
print()
print('\nDuplicate Columns in the Dataset: \n', dataset.columns.duplicated())
dataset = dataset.loc[:, ~dataset.columns.duplicated()]
print()

print()
print('#---------------------------------------------------------------')
print('Check and Drop Sparse Data/Columns in the Dataset')
print('#---------------------------------------------------------------')
flist = [x for x in dataset.columns if not x in ['price_range']]
print(); print(flist)
for f in flist:
if len(np.unique(dataset[f])) < 2:
print('Feature contains Sparse Data: ', f)
dataset.drop(f, axis=1, inplace=True)
print()

# --------------------------------------------------
# Missing Values treatment in the DataSet (if any)
# --------------------------------------------------
# a) Filling NULL values with Zeros
#dataset = dataset.fillna(0)
#print('\nCount Number of Missing Value on Each Column: ')
## Count Number of Missing Value on Each Column
#print(dataset.isnull().sum(axis=0))
#print('\nCount Number of Missing Value on Each Row: ')
## Count Number of Missing Value on Each Row
#print(dataset.isnull().sum(axis=1))

# b) Filling NULL values according to their dataTypes
# Group Dataset according to different dataTypes
gd = dataset.columns.to_series().groupby(dataset.dtypes).groups
print('\nGroup Columns according to their dataTypes: \n', gd)
colNames = dataset.columns.values.tolist()
for colName in colNames:
if dataset[colName].dtypes == 'int64':
dataset[colName] = dataset[colName].fillna(0)
if dataset[colName].dtypes == 'float64':
dataset[colName] = dataset[colName].fillna(0.0)
if dataset[colName].dtypes == 'object':
dataset[colName] = dataset[colName].fillna('Unknown')

## Count Number of Missing Value on Each Column
print('\nCount Number of Missing Value on Each Column: ')
print(dataset.isnull().sum(axis=0))
## Count Number of Missing Value on Each Row
#print('\nCount Number of Missing Value on Each Row: ')
#print(dataset.isnull().sum(axis=1))

# Check if there are any missing values in Dataset
feature_count = dataset.columns[dataset.isnull().sum() != 0].size
print()
print("Total Features with missing Values = " + str(feature_count))

return(dataset)

# -------------------------------------------------------------------------
# descriptive statistics and correlation matrix
# -------------------------------------------------------------------------
def data_descriptiveStats(feature_names, target, dataset):
# Count Number of Missing Value on Each Column
print(); print('Count Number of Missing Value on Each Column: ')
print(); print(dataset[feature_names].isnull().sum(axis=0))
print(); print(dataset[target].isnull().sum(axis=0))

# Get Information on the feature variables
print(); print('Get Information on the feature variables: ')
print(); print(dataset[feature_names].info())
print(); print(dataset[feature_names].describe())

# correlation
pd.set_option('precision', 2)
print(); print(dataset[feature_names].corr())

# Ranking of Correlation Coefficients among Variable Pairs
print(); print("Ranking of Correlation Coefficients:")
corrank(dataset[feature_names])

# Print Highly Correlated Variables
print(); print("Highly correlated variables (Absolute Correlations):")
print(); print(get_top_abs_correlations(dataset[feature_names], 8))

# Get Information on the target
print(); print(dataset[target].describe())
print(); print(dataset.groupby(target).size())

# -------------------------------------------------------------------------
# data visualisation and correlation graph
# -------------------------------------------------------------------------
def data_visualization(feature_names, target, dataset):
# BOX plots USING box and whisker plots
i = 1
print(); print('BOX plot of each numerical features')
plt.figure(figsize=(11,9))
for col in feature_names:
plt.subplot(5,4,i)
plt.axis('on')
plt.tick_params(axis='both', left=True, top=False, right=False, bottom=True,
labelleft=False, labeltop=False, labelright=False, labelbottom=False)
dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
i += 1
plt.show()

# USING histograms
j = 1
print(); print('Histogram of each Numerical Feature')
plt.figure(figsize=(11,9))
for col in feature_names:
plt.subplot(5,4,j)
plt.axis('on')
plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False,
labelleft=False, labeltop=False, labelright=False, labelbottom=False)
dataset[col].hist()
j += 1
plt.show()

# correlation matrix
print(); print('Correlation Matrix of All Numerical Features')
fig = plt.figure(figsize=(11,9))
cax = ax.matshow(dataset[feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
ticks = np.arange(0,20,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
plt.show()

# Correlation Plot using seaborn
print(); print("Correlation plot of Numerical features")
# Compute the correlation matrix
corr = dataset[feature_names].corr()
print(corr)
# Generate a mask for the upper triangle
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

# Pie chart for Categorical Variables
print(); print('PIE Chart of for Target: ')
plt.figure(figsize=(11,9))
i = 1
for colName in [target]:
labels = []; sizes = [];
df = dataset.groupby(colName).size()
for key in df.keys():
labels.append(key)
sizes.append(df[key])
# Plot PIE Chart with %
plt.subplot(2,2,i)
plt.axis('on')
plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False,
labelleft=True, labeltop=True, labelright=False, labelbottom=False)
plt.axis('equal')
i += 1; plt.savefig('Piefig.pdf', format='pdf')
plt.show()

# -------------------------------------------------------------------------
# data split to train and test datasets
# -------------------------------------------------------------------------
def data_split(feature_names, target, dataset):
# Data Transform - Split train : test datasets
X_train, X_test, y_train, y_test = train_test_split(dataset.loc[:, feature_names],
dataset.loc[:, target], test_size=0.20)
return X_train, X_test, y_train, y_test

def training_model(X_train, y_train):

_value = []; _model = []; _best_features = [];

# Create different Feature subsets
F1 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']

F2 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',

'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']

F3 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',

'touch_screen', 'wifi']

F4 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'
]

F5 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']

F6 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']

F7 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'n_cores', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']

subsets_sum = [F1] + [F2] + [F3] + [F4] + [F5] + [F6] + [F7]

print(subsets_sum)

# Twelve random sates randomly choosen for the outer-MCCV
for i in [32,41,45,52,65,72,96,97,112,114,128,142]:

warnings.filterwarnings("ignore")

print ('\n\nRandom state : ', i)

model = xgboost.XGBClassifier(objective = 'binary:logistic', n_estimators=200)

#  Split the dataset into two stratified parts, 80% for Outer training set
X1_train, X1_test, y1_train, y1_test = train_test_split(X_train, y_train,
train_size=0.8, random_state=i, stratify=y_train)

# Choose k-fold cross-validation technique for the inner loop
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

# Set temporary variables
best_subset = []
best_auc = -np.inf

# Loop over the features combinations
for subset in subsets_sum:
score = cross_val_score(model, X=X1_train[subset], y=y1_train,
cv = inner_cv.split(X1_train[subset], y1_train),
scoring='accuracy')
if score.mean() > best_auc:
best_auc = score.mean()
best_subset = subset

# Train the model on the Outer training set with the selected feature combination
model = model.fit(X1_train[best_subset], y1_train)

# Calculate the predicted labels with the model on the Outer test set with the selected feature combination
y1_pred = model.predict(X1_test[best_subset])

# Calculate the accuracy between predicted and true labels
acc = accuracy_score(y1_test, y1_pred)
print('Selected features:', best_subset,'; Outer Test ACC: ',acc)

_best_features.append(best_subset); _value.append(acc); _model.append(model);

#for i in range(0, len(_value)):
#    print(); print(_best_features[i]); print('Accuracy: ',_value[i]); print(_model[i])

print(); print(_value)
print(); print('Maximum Accuracy Index: ', np.argmax(_value))

idx = np.argmax(_value)
print("\nBest model parameters with random_state:");    print(_model[idx])
print("\nBest feature combination:");    print(_best_features[idx])
print("\nBest accuracy from MCCV:");    print(_value[idx])

return(_model[idx], _best_features[idx])

def evaluate_model(model, features, X_train, y_train, X_test, y_test):

print()
print(model.get_params(deep=True))

# Evaluate the skill of the Trained model
pred_Class          = model.predict(X_test[features])
acc                 = accuracy_score(y_test, pred_Class)
classReport         = classification_report(y_test, pred_Class)
confMatrix          = confusion_matrix(y_test, pred_Class)
kappa_score         = cohen_kappa_score(y_test, pred_Class)

print(); print('Evaluation of the trained model: ')
print(); print('Accuracy : ', acc)
print(); print('Kappa Score : ', kappa_score)
print(); print('Confusion Matrix :\n', confMatrix)
print(); print('Classification Report :\n',classReport)

pred_proba = model.predict_proba(X_test[features])

# Add more plots here using scikit-plot
# ROC curves
skplt.metrics.plot_roc(y_test,pred_proba,figsize=(8,6)); plt.show()

# Confusion matrix
skplt.metrics.plot_confusion_matrix(y_test,pred_Class,figsize=(6,6)); plt.show()

# precision recall curve
skplt.metrics.plot_precision_recall(y_test, pred_proba,
title='Precision-Recall Curve', plot_micro=True,
classes_to_plot=None, ax=None, figsize=(9,6),
cmap='nipy_spectral', title_fontsize='large',
text_fontsize='medium'); plt.show()

# Add more ... ... ...

# plot learning Curves
#skplt.estimators.plot_learning_curve(model, X_train[features], y_train, figsize=(6,6))
#plt.show()

return model

def featureRank_Analysis(model, dataset, cols):
print()
print("Feature Importance/Rank Analysis: ")
X = dataset.loc[:, cols]; X_cols = X.columns.values

features_imp = model.feature_importances_

indices = np.argsort(features_imp)[::-1]
df = {}
for f in range(X.shape[1]):
print("%d. feature %d %s (%f)" % (f + 1, indices[f], X_cols[indices[f]],
features_imp[indices[f]]))
df[f] = [f + 1, indices[f], X_cols[indices[f]], features_imp[indices[f]]]

df1 = pd.DataFrame.from_dict(df, orient = 'index')
df1.columns = ['feature_Rank', 'feature_Index', 'feature_Name', 'feature_importance']
df1.to_csv("FeatureImportanceRank.csv", index = False)

# this creates a figure 5 inch wide, 3 inch high
plt.figure(figsize=(8,8))
plt.barh(df1['feature_Rank'], df1['feature_importance'], tick_label = df1['feature_Name'])
plt.savefig('Featurefig.pdf', format='pdf')
plt.show()

skplt.estimators.plot_feature_importances(model, feature_names=cols,
x_tick_rotation = 90, figsize=(8,8))
plt.show()

# ------------------------------------------------
# ------------------------------------------------
# install graphViz and pydotplus using pip
# install binaries from graphViz.org and
# https://stackoverflow.com/questions/18438997/
# why-is-pydot-unable-to-find-graphvizs-executables-in-windows-8
# ------------------------------------------------
# Get an arbitary tree number between (0,99)
# as "n_estimators = 100"

sub_tree_number = 27
# plot tree from Left to Right
xgboost.plot_tree(model, num_trees=sub_tree_number, rankdir='LR')
fig = plt.gcf(); fig.set_size_inches(9, 9); plt.show()
fig.savefig('treeOpenML mobileset price-1.png')
# plot tree top to bottom
xgboost.plot_tree(model, num_trees=sub_tree_number)
fig = plt.gcf(); fig.set_size_inches(9, 9); plt.show()
fig.savefig('treeOpenML mobileset price-2.png')

# plot feature importance
xgboost.plot_importance(model); plt.show()

def save_model(model):
with open('DSC_Recipe_8_model.pickle', 'wb') as f:
pk.dump(model, f)

def final_prediction(feature_names, filename):
f = open('DSC_Recipe_8_model.pickle', 'rb')

dataset = pd.read_csv(filename, sep = ',')

print(dataset.dtypes)

dataset = dummyEncode(dataset)

# final prediction and results
predicted_class     = model.predict(dataset[feature_names])
pred_proba          = model.predict_proba(dataset[feature_names])
dataset['predicted_class'] = predicted_class

# Evaluate the skill of the Trained model
acc                 = accuracy_score(dataset['price_range'], predicted_class)
classReport         = classification_report(dataset['price_range'], predicted_class)
confMatrix          = confusion_matrix(dataset['price_range'], predicted_class)
kappa_score         = cohen_kappa_score(dataset['price_range'], predicted_class)

print(); print('Testing Results of the trained model: ')
print(); print('Accuracy : ', acc)
print(); print('Kappa Score : ', kappa_score)
print(); print('Confusion Matrix :\n', confMatrix)
print(); print('Classification Report :\n',classReport)

# ROC curves
skplt.metrics.plot_roc(dataset['price_range'],pred_proba,figsize=(7,7)); plt.show()

# Confusion matrix
skplt.metrics.plot_confusion_matrix(dataset['price_range'],
predicted_class,figsize=(7,7)); plt.show()

# precision recall curve
skplt.metrics.plot_precision_recall(dataset['price_range'], pred_proba,
title='Precision-Recall Curve', plot_micro=True,
classes_to_plot=None, ax=None, figsize=(7,7),
cmap='nipy_spectral', title_fontsize='large',
text_fontsize='medium'); plt.show()

dataset.to_csv('FinalResult.csv', index = False,
columns = ['price_range', 'predicted_class'])

def final_prediction_with_testDataset(feature_names, filename):
f = open('DSC_Recipe_8_model.pickle', 'rb')

dataset = pd.read_csv(filename, sep = ',')

print(dataset.dtypes)

dataset = dummyEncode(dataset)

# final prediction and results
predicted_class     = model.predict(dataset[feature_names])
pred_proba          = model.predict_proba(dataset[feature_names])

dataset['predicted_class'] = predicted_class
dataset['predicted_proba'] = pred_proba.tolist()

dataset.to_csv('FinalResultWith_testDataset.csv', index = False)

if __name__ == '__main__':
print()
print("Execution Time %s seconds: " % (start_time))
filename = 'mobilePriceClassification_trainDataset.csv'

dataset = find_miising_value(feature_names, target, dataset)
data_descriptiveStats(feature_names, target, dataset)
data_visualization(feature_names, target, dataset)
X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)
model, features = training_model(X_train, y_train)
model = evaluate_model(model, features, X_train, y_train, X_test, y_test)
featureRank_Analysis(model, dataset, features)
save_model(model)

test_filename = 'mobilePriceClassification_trainDataset.csv'
final_prediction(features, test_filename)

test_filename = 'mobilePriceClassification_testDataset.csv'
final_prediction_with_testDataset(features, test_filename)

print()
print("Execution Time %s seconds: " % (time.time() - start_time))

In [6]:
DSC_Recipe_8()

*******Recipe for Data Science Competition - DSC_Recipe_8********
Classification with OpenML mobileset price dataset using XGBoost & Monte Carlo Cross Validation
*********************Package: scikit-learn **********************
**********************Model: XGBoost Model***********************
*************DataSet: OpenML mobileset price Dataset*************
Model selection: using Monte Carlo Cross Validation (MCCV) / Repeated Random Sub-Sampling Validation (RRSSV)

Execution Time 1614061750.594958 seconds:
(2000, 21)
battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6
1           1021     1          0.5         1   0       1          53    0.7
2            563     1          0.5         1   2       1          41    0.9
3            615     1          2.5         0   0       0          10    0.8
4           1821     1          1.2         0  13       1          44    0.6

mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19
1        136        3  ...        905      1988  2631    17     3          7
2        145        5  ...       1263      1716  2603    11     2          9
3        131        6  ...       1216      1786  2769    16     8         11
4        141        2  ...       1208      1212  1411     8     2         15

three_g  touch_screen  wifi  price_range
0        0             0     1            1
1        1             1     0            2
2        1             1     0            2
3        1             0     0            2
4        1             1     0            1

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi', 'price_range'],
dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

#---------------------------------------------------------------
Check for Mising Value or NaN Value in the Dataset
#---------------------------------------------------------------

Count Number of Missing Value on Each Column:
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

#---------------------------------------------------------------
Check and Remove constant columns in the Dataset
#---------------------------------------------------------------

Removed 0 Constant Columns:
[]

#---------------------------------------------------------------
Check and Remove Duplicate Columns in the Dataset
#---------------------------------------------------------------

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi', 'price_range'],
dtype='object')
battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6
1           1021     1          0.5         1   0       1          53    0.7
2            563     1          0.5         1   2       1          41    0.9
3            615     1          2.5         0   0       0          10    0.8
4           1821     1          1.2         0  13       1          44    0.6

mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19
1        136        3  ...        905      1988  2631    17     3          7
2        145        5  ...       1263      1716  2603    11     2          9
3        131        6  ...       1216      1786  2769    16     8         11
4        141        2  ...       1208      1212  1411     8     2         15

three_g  touch_screen  wifi  price_range
0        0             0     1            1
1        1             1     0            2
2        1             1     0            2
3        1             0     0            2
4        1             1     0            1

[5 rows x 21 columns]

Duplicate Columns in the Dataset:
[False False False False False False False False False False False False
False False False False False False False False False]

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi', 'price_range'],
dtype='object')
battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6
1           1021     1          0.5         1   0       1          53    0.7
2            563     1          0.5         1   2       1          41    0.9
3            615     1          2.5         0   0       0          10    0.8
4           1821     1          1.2         0  13       1          44    0.6

mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19
1        136        3  ...        905      1988  2631    17     3          7
2        145        5  ...       1263      1716  2603    11     2          9
3        131        6  ...       1216      1786  2769    16     8         11
4        141        2  ...       1208      1212  1411     8     2         15

three_g  touch_screen  wifi  price_range
0        0             0     1            1
1        1             1     0            2
2        1             1     0            2
3        1             0     0            2
4        1             1     0            1

[5 rows x 21 columns]

#---------------------------------------------------------------
Check and Drop Sparse Data/Columns in the Dataset
#---------------------------------------------------------------

['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi', 'price_range'],
dtype='object')
battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6
1           1021     1          0.5         1   0       1          53    0.7
2            563     1          0.5         1   2       1          41    0.9
3            615     1          2.5         0   0       0          10    0.8
4           1821     1          1.2         0  13       1          44    0.6

mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19
1        136        3  ...        905      1988  2631    17     3          7
2        145        5  ...       1263      1716  2603    11     2          9
3        131        6  ...       1216      1786  2769    16     8         11
4        141        2  ...       1208      1212  1411     8     2         15

three_g  touch_screen  wifi  price_range
0        0             0     1            1
1        1             1     0            2
2        1             1     0            2
3        1             0     0            2
4        1             1     0            1

[5 rows x 21 columns]

Group Columns according to their dataTypes:
{int64: ['battery_power', 'blue', 'dual_sim', 'fc', 'four_g', 'int_memory', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range'], float64: ['clock_speed', 'm_dep']}

Count Number of Missing Value on Each Column:
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

Total Features with missing Values = 0

Count Number of Missing Value on Each Column:

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
dtype: int64

0

Get Information on the feature variables:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
#   Column         Non-Null Count  Dtype
---  ------         --------------  -----
0   battery_power  2000 non-null   int64
1   blue           2000 non-null   int64
2   clock_speed    2000 non-null   float64
3   dual_sim       2000 non-null   int64
4   fc             2000 non-null   int64
5   four_g         2000 non-null   int64
6   int_memory     2000 non-null   int64
7   m_dep          2000 non-null   float64
8   mobile_wt      2000 non-null   int64
9   n_cores        2000 non-null   int64
10  pc             2000 non-null   int64
11  px_height      2000 non-null   int64
12  px_width       2000 non-null   int64
13  ram            2000 non-null   int64
14  sc_h           2000 non-null   int64
15  sc_w           2000 non-null   int64
16  talk_time      2000 non-null   int64
17  three_g        2000 non-null   int64
18  touch_screen   2000 non-null   int64
19  wifi           2000 non-null   int64
dtypes: float64(2), int64(18)
memory usage: 312.6 KB
None

battery_power     blue  clock_speed  dual_sim       fc   four_g  \
count        2000.00  2000.00      2000.00   2000.00  2000.00  2000.00
mean         1238.52     0.49         1.52      0.51     4.31     0.52
std           439.42     0.50         0.82      0.50     4.34     0.50
min           501.00     0.00         0.50      0.00     0.00     0.00
25%           851.75     0.00         0.70      0.00     1.00     0.00
50%          1226.00     0.00         1.50      1.00     3.00     1.00
75%          1615.25     1.00         2.20      1.00     7.00     1.00
max          1998.00     1.00         3.00      1.00    19.00     1.00

int_memory    m_dep  mobile_wt  n_cores       pc  px_height  px_width  \
count     2000.00  2000.00    2000.00  2000.00  2000.00    2000.00   2000.00
mean        32.05     0.50     140.25     4.52     9.92     645.11   1251.52
std         18.15     0.29      35.40     2.29     6.06     443.78    432.20
min          2.00     0.10      80.00     1.00     0.00       0.00    500.00
25%         16.00     0.20     109.00     3.00     5.00     282.75    874.75
50%         32.00     0.50     141.00     4.00    10.00     564.00   1247.00
75%         48.00     0.80     170.00     7.00    15.00     947.25   1633.00
max         64.00     1.00     200.00     8.00    20.00    1960.00   1998.00

ram     sc_h     sc_w  talk_time  three_g  touch_screen     wifi
count  2000.00  2000.00  2000.00    2000.00  2000.00        2000.0  2000.00
mean   2124.21    12.31     5.77      11.01     0.76           0.5     0.51
std    1084.73     4.21     4.36       5.46     0.43           0.5     0.50
min     256.00     5.00     0.00       2.00     0.00           0.0     0.00
25%    1207.50     9.00     2.00       6.00     1.00           0.0     0.00
50%    2146.50    12.00     5.00      11.00     1.00           1.0     1.00
75%    3064.50    16.00     9.00      16.00     1.00           1.0     1.00
max    3998.00    19.00    18.00      20.00     1.00           1.0     1.00

battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02

four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03

px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02

three_g  touch_screen      wifi
battery_power  1.15e-02     -1.05e-02 -8.34e-03
blue          -3.02e-02      1.01e-02 -2.19e-02
clock_speed   -4.64e-02      1.98e-02 -2.45e-02
dual_sim      -1.40e-02     -1.71e-02  2.27e-02
fc             1.79e-03     -1.48e-02  2.01e-02
four_g         5.84e-01      1.68e-02 -1.76e-02
int_memory    -9.37e-03     -2.70e-02  6.99e-03
m_dep         -1.21e-02     -2.64e-03 -2.84e-02
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04
n_cores       -1.47e-02      2.38e-02 -9.96e-03
pc            -1.32e-03     -8.74e-03  5.39e-03
px_height     -3.12e-02      2.19e-02  5.18e-02
px_width       3.50e-04     -1.63e-03  3.03e-02
ram            1.58e-02     -3.05e-02  2.27e-02
sc_h           1.20e-02     -2.00e-02  2.59e-02
sc_w           3.09e-02      1.27e-02  3.54e-02
talk_time     -4.27e-02      1.72e-02 -2.95e-02
three_g        1.00e+00      1.39e-02  4.32e-03
touch_screen   1.39e-02      1.00e+00  1.19e-02
wifi           4.32e-03      1.19e-02  1.00e+00

Ranking of Correlation Coefficients:
pairs  corr
75                    (fc, pc)  0.64
96           (four_g, three_g)  0.58
154      (px_height, px_width)  0.51
175               (sc_h, sc_w)  0.51
156          (px_height, sc_h)  0.06
..                         ...   ...
2    (battery_power, dual_sim) -0.04
184       (talk_time, three_g) -0.04
39       (clock_speed, four_g) -0.04
51      (clock_speed, three_g) -0.05
95         (four_g, talk_time) -0.05

[190 rows x 2 columns]

Highly correlated variables (Absolute Correlations):

fc             pc           0.64
four_g         three_g      0.58
px_height      px_width     0.51
sc_h           sc_w         0.51
px_height      sc_h         0.06
battery_power  talk_time    0.05
px_height      wifi         0.05
sc_w         0.04
dtype: float64

count    2000.00
mean        1.50
std         1.12
min         0.00
25%         0.75
50%         1.50
75%         2.25
max         3.00
Name: price_range, dtype: float64

price_range
0    500
1    500
2    500
3    500
dtype: int64

BOX plot of each numerical features

Histogram of each Numerical Feature

Correlation Matrix of All Numerical Features

Correlation plot of Numerical features
battery_power      blue  clock_speed  dual_sim        fc  \
battery_power       1.00e+00  1.13e-02     1.15e-02 -4.18e-02  3.33e-02
blue                1.13e-02  1.00e+00     2.14e-02  3.52e-02  3.59e-03
clock_speed         1.15e-02  2.14e-02     1.00e+00 -1.32e-03 -4.34e-04
dual_sim           -4.18e-02  3.52e-02    -1.32e-03  1.00e+00 -2.91e-02
fc                  3.33e-02  3.59e-03    -4.34e-04 -2.91e-02  1.00e+00
four_g              1.57e-02  1.34e-02    -4.31e-02  3.19e-03 -1.66e-02
int_memory         -4.00e-03  4.12e-02     6.55e-03 -1.57e-02 -2.91e-02
m_dep               3.41e-02  4.05e-03    -1.44e-02 -2.21e-02 -1.79e-03
mobile_wt           1.84e-03 -8.60e-03     1.23e-02 -8.98e-03  2.36e-02
n_cores            -2.97e-02  3.62e-02    -5.72e-03 -2.47e-02 -1.34e-02
pc                  3.14e-02 -9.95e-03    -5.25e-03 -1.71e-02  6.45e-01
px_height           1.49e-02 -6.87e-03    -1.45e-02 -2.09e-02 -9.99e-03
px_width           -8.40e-03 -4.15e-02    -9.48e-03  1.43e-02 -5.18e-03
ram                -6.53e-04  2.64e-02     3.44e-03  4.11e-02  1.51e-02
sc_h               -3.00e-02 -2.95e-03    -2.91e-02 -1.19e-02 -1.10e-02
sc_w               -2.14e-02  6.13e-04    -7.38e-03 -1.67e-02 -1.24e-02
talk_time           5.25e-02  1.39e-02    -1.14e-02 -3.94e-02 -6.83e-03
three_g             1.15e-02 -3.02e-02    -4.64e-02 -1.40e-02  1.79e-03
touch_screen       -1.05e-02  1.01e-02     1.98e-02 -1.71e-02 -1.48e-02
wifi               -8.34e-03 -2.19e-02    -2.45e-02  2.27e-02  2.01e-02

four_g  int_memory     m_dep  mobile_wt   n_cores        pc  \
battery_power  1.57e-02   -4.00e-03  3.41e-02   1.84e-03 -2.97e-02  3.14e-02
blue           1.34e-02    4.12e-02  4.05e-03  -8.60e-03  3.62e-02 -9.95e-03
clock_speed   -4.31e-02    6.55e-03 -1.44e-02   1.23e-02 -5.72e-03 -5.25e-03
dual_sim       3.19e-03   -1.57e-02 -2.21e-02  -8.98e-03 -2.47e-02 -1.71e-02
fc            -1.66e-02   -2.91e-02 -1.79e-03   2.36e-02 -1.34e-02  6.45e-01
four_g         1.00e+00    8.69e-03 -1.82e-03  -1.65e-02 -2.97e-02 -5.60e-03
int_memory     8.69e-03    1.00e+00  6.89e-03  -3.42e-02 -2.83e-02 -3.33e-02
m_dep         -1.82e-03    6.89e-03  1.00e+00   2.18e-02 -3.50e-03  2.63e-02
mobile_wt     -1.65e-02   -3.42e-02  2.18e-02   1.00e+00 -1.90e-02  1.88e-02
n_cores       -2.97e-02   -2.83e-02 -3.50e-03  -1.90e-02  1.00e+00 -1.19e-03
pc            -5.60e-03   -3.33e-02  2.63e-02   1.88e-02 -1.19e-03  1.00e+00
px_height     -1.92e-02    1.04e-02  2.53e-02   9.39e-04 -6.87e-03 -1.85e-02
px_width       7.45e-03   -8.33e-03  2.36e-02   8.98e-05  2.45e-02  4.20e-03
ram            7.31e-03    3.28e-02 -9.43e-03  -2.58e-03  4.87e-03  2.90e-02
sc_h           2.72e-02    3.78e-02 -2.53e-02  -3.39e-02 -3.15e-04  4.94e-03
sc_w           3.70e-02    1.17e-02 -1.84e-02  -2.08e-02  2.58e-02 -2.38e-02
talk_time     -4.66e-02   -2.79e-03  1.70e-02   6.21e-03  1.31e-02  1.47e-02
three_g        5.84e-01   -9.37e-03 -1.21e-02   1.55e-03 -1.47e-02 -1.32e-03
touch_screen   1.68e-02   -2.70e-02 -2.64e-03  -1.44e-02  2.38e-02 -8.74e-03
wifi          -1.76e-02    6.99e-03 -2.84e-02  -4.09e-04 -9.96e-03  5.39e-03

px_height  px_width       ram      sc_h      sc_w  talk_time  \
battery_power   1.49e-02 -8.40e-03 -6.53e-04 -3.00e-02 -2.14e-02   5.25e-02
blue           -6.87e-03 -4.15e-02  2.64e-02 -2.95e-03  6.13e-04   1.39e-02
clock_speed    -1.45e-02 -9.48e-03  3.44e-03 -2.91e-02 -7.38e-03  -1.14e-02
dual_sim       -2.09e-02  1.43e-02  4.11e-02 -1.19e-02 -1.67e-02  -3.94e-02
fc             -9.99e-03 -5.18e-03  1.51e-02 -1.10e-02 -1.24e-02  -6.83e-03
four_g         -1.92e-02  7.45e-03  7.31e-03  2.72e-02  3.70e-02  -4.66e-02
int_memory      1.04e-02 -8.33e-03  3.28e-02  3.78e-02  1.17e-02  -2.79e-03
m_dep           2.53e-02  2.36e-02 -9.43e-03 -2.53e-02 -1.84e-02   1.70e-02
mobile_wt       9.39e-04  8.98e-05 -2.58e-03 -3.39e-02 -2.08e-02   6.21e-03
n_cores        -6.87e-03  2.45e-02  4.87e-03 -3.15e-04  2.58e-02   1.31e-02
pc             -1.85e-02  4.20e-03  2.90e-02  4.94e-03 -2.38e-02   1.47e-02
px_height       1.00e+00  5.11e-01 -2.04e-02  5.96e-02  4.30e-02  -1.06e-02
px_width        5.11e-01  1.00e+00  4.11e-03  2.16e-02  3.47e-02   6.72e-03
ram            -2.04e-02  4.11e-03  1.00e+00  1.60e-02  3.56e-02   1.08e-02
sc_h            5.96e-02  2.16e-02  1.60e-02  1.00e+00  5.06e-01  -1.73e-02
sc_w            4.30e-02  3.47e-02  3.56e-02  5.06e-01  1.00e+00  -2.28e-02
talk_time      -1.06e-02  6.72e-03  1.08e-02 -1.73e-02 -2.28e-02   1.00e+00
three_g        -3.12e-02  3.50e-04  1.58e-02  1.20e-02  3.09e-02  -4.27e-02
touch_screen    2.19e-02 -1.63e-03 -3.05e-02 -2.00e-02  1.27e-02   1.72e-02
wifi            5.18e-02  3.03e-02  2.27e-02  2.59e-02  3.54e-02  -2.95e-02

three_g  touch_screen      wifi
battery_power  1.15e-02     -1.05e-02 -8.34e-03
blue          -3.02e-02      1.01e-02 -2.19e-02
clock_speed   -4.64e-02      1.98e-02 -2.45e-02
dual_sim      -1.40e-02     -1.71e-02  2.27e-02
fc             1.79e-03     -1.48e-02  2.01e-02
four_g         5.84e-01      1.68e-02 -1.76e-02
int_memory    -9.37e-03     -2.70e-02  6.99e-03
m_dep         -1.21e-02     -2.64e-03 -2.84e-02
mobile_wt      1.55e-03     -1.44e-02 -4.09e-04
n_cores       -1.47e-02      2.38e-02 -9.96e-03
pc            -1.32e-03     -8.74e-03  5.39e-03
px_height     -3.12e-02      2.19e-02  5.18e-02
px_width       3.50e-04     -1.63e-03  3.03e-02
ram            1.58e-02     -3.05e-02  2.27e-02
sc_h           1.20e-02     -2.00e-02  2.59e-02
sc_w           3.09e-02      1.27e-02  3.54e-02
talk_time     -4.27e-02      1.72e-02 -2.95e-02
three_g        1.00e+00      1.39e-02  4.32e-03
touch_screen   1.39e-02      1.00e+00  1.19e-02
wifi           4.32e-03      1.19e-02  1.00e+00

PIE Chart of for Target:

[['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'], ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'n_cores', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']]

Random state :  32
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'] ; Outer Test ACC:  0.921875

Random state :  41
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.93125

Random state :  45
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.89375

Random state :  52
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.9125

Random state :  65
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.925

Random state :  72
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.915625

Random state :  96
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.90625

Random state :  97
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.925

Random state :  112
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'] ; Outer Test ACC:  0.909375

Random state :  114
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.909375

Random state :  128
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.928125

Random state :  142
Selected features: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'] ; Outer Test ACC:  0.9

[0.921875, 0.93125, 0.89375, 0.9125, 0.925, 0.915625, 0.90625, 0.925, 0.909375, 0.909375, 0.928125, 0.9]

Maximum Accuracy Index:  1

Best model parameters with random_state:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=True, subsample=1)

Best feature combination:
['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']

Best accuracy from MCCV:
0.93125

{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 200, 'n_jobs': 1, 'nthread': None, 'objective': 'multi:softprob', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 1}

Evaluation of the trained model:

Accuracy :  0.8925

Kappa Score :  0.8562894264109955

Confusion Matrix :
[[ 89   4   0   0]
[  7  91   4   0]
[  0   9  73   5]
[  0   0  14 104]]

Classification Report :
precision    recall  f1-score   support

0       0.93      0.96      0.94        93
1       0.88      0.89      0.88       102
2       0.80      0.84      0.82        87
3       0.95      0.88      0.92       118

avg / total       0.89      0.89      0.89       400


Feature Importance/Rank Analysis:
1. feature 10 ram (0.360073)
2. feature 0 battery_power (0.184809)
3. feature 9 px_width (0.144192)
4. feature 8 px_height (0.144192)
5. feature 13 talk_time (0.031885)
6. feature 7 pc (0.030260)
7. feature 12 sc_w (0.027417)
8. feature 2 clock_speed (0.022340)
9. feature 4 fc (0.017669)
10. feature 11 sc_h (0.013607)
11. feature 6 n_cores (0.011576)
12. feature 3 dual_sim (0.003859)
13. feature 15 touch_screen (0.003249)
14. feature 16 wifi (0.001625)
15. feature 1 blue (0.001422)
16. feature 14 three_g (0.001015)
17. feature 5 four_g (0.000812)

(2000, 21)
battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6
1           1021     1          0.5         1   0       1          53    0.7
2            563     1          0.5         1   2       1          41    0.9
3            615     1          2.5         0   0       0          10    0.8
4           1821     1          1.2         0  13       1          44    0.6

mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19
1        136        3  ...        905      1988  2631    17     3          7
2        145        5  ...       1263      1716  2603    11     2          9
3        131        6  ...       1216      1786  2769    16     8         11
4        141        2  ...       1208      1212  1411     8     2         15

three_g  touch_screen  wifi  price_range
0        0             0     1            1
1        1             1     0            2
2        1             1     0            2
3        1             0     0            2
4        1             1     0            1

[5 rows x 21 columns]
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi', 'price_range'],
dtype='object')
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

Testing Results of the trained model:

Accuracy :  0.9675

Kappa Score :  0.9566666666666667

Confusion Matrix :
[[495   5   0   0]
[ 13 481   6   0]
[  0  15 476   9]
[  0   0  17 483]]

Classification Report :
precision    recall  f1-score   support

0       0.97      0.99      0.98       500
1       0.96      0.96      0.96       500
2       0.95      0.95      0.95       500
3       0.98      0.97      0.97       500

avg / total       0.97      0.97      0.97      2000


(1000, 21)
id  battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \
0   1           1043     1          1.8         1  14       0           5
1   2            841     1          0.5         1   4       1          61
2   3           1807     1          2.8         0   1       0          27
3   4           1546     0          0.5         1  18       1          25
4   5           1434     0          1.4         0  11       1          49

m_dep  mobile_wt  ...  pc  px_height  px_width   ram  sc_h  sc_w  \
0    0.1        193  ...  16        226      1412  3476    12     7
1    0.8        191  ...  12        746       857  3895     6     0
2    0.9        186  ...   4       1270      1366  2396    17    10
3    0.5         96  ...  20        295      1752  3893    10     0
4    0.5        108  ...  18        749       810  1773    15     8

talk_time  three_g  touch_screen  wifi
0          2        0             1     0
1          7        1             0     0
2         10        0             1     1
3          7        1             1     0
4          7        1             0     1

[5 rows x 21 columns]
Index(['id', 'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc',
'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc',
'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi'],
dtype='object')
id                 int64
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
dtype: object

Execution Time 204.0043179988861 seconds:

In [ ]: