def DSC_Recipe_8():
print()
print(format('Recipe for Data Science Competition - DSC_Recipe_8','*^65'))
    print(format('Classification with the OpenML Mobile Price dataset using XGBoost & Monte Carlo Cross Validation', '*^95'))
    print(format('Packages: scikit-learn, xgboost', '*^65'))
    print(format('Model: XGBoost Classifier', '*^65'))
    print(format('DataSet: OpenML Mobile Price Dataset', '*^65'))
    print(format('Model selection: using Monte Carlo Cross Validation (MCCV) / Repeated Random Sub-Sampling Validation (RRSSV)', '*^95'))
# load necessary libraries
import time
import pandas as pd
import pickle as pk
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import warnings
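# Note: xgboost and scikit-plot are third-party packages; if they are missing,
# they can typically be installed with: pip install xgboost scikit-plot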
start_time = time.time()
# -------------------------------------------------------------------------
# Helper modules for Descriptive Statistics
# -------------------------------------------------------------------------
def get_redundant_pairs(df):
pairs_to_drop = set()
cols = df.columns
for i in range(0, df.shape[1]):
for j in range(0, i+1):
pairs_to_drop.add((cols[i], cols[j]))
return pairs_to_drop
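# Example: for columns [a, b, c], get_redundant_pairs drops the diagonal and
# lower-triangle pairs (a,a), (b,a), (b,b), (c,a), (c,b), (c,c), so each
# correlation pair is counted exactly once.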
def get_top_abs_correlations(df, n=5):
    # rank variable pairs by absolute correlation strength,
    # as promised by the function name and the caller's printout
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]
def corrank(X):
    import itertools
    # compute the correlation matrix once, then list every unique column pair
    corr = X.corr()
    df = pd.DataFrame([[(i, j), corr.loc[i, j]]
                       for i, j in itertools.combinations(corr.columns, 2)],
                      columns=['pairs', 'corr'])
    print(df.sort_values(by='corr', ascending=False))
    print()
# Helper module for Label Encoding for Categorical Features
def dummyEncode(df):
    # label-encode every categorical/object column in place
    columnsToEncode = list(df.select_dtypes(include=['category', 'object']))
    le = LabelEncoder()
    for feature in columnsToEncode:
        try:
            df[feature] = le.fit_transform(df[feature])
        except Exception:
            print('Error encoding ' + feature)
    return df
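# Usage sketch (hypothetical column): an object column such as ['low', 'high', 'low']
# would be mapped in place to integer codes such as [1, 0, 1].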
# -------------------------------------------------------------------------
# load dataset
# -------------------------------------------------------------------------
def load_dataset(filename):
dataset = pd.read_csv(filename, sep = ',')
print(dataset.shape); print(dataset.head(5)); print(dataset.columns);
print(dataset.dtypes)
feature_names = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']
target = 'price_range'
    dataset = dummyEncode(dataset[feature_names + [target]])
return feature_names, target, dataset
# -------------------------------------------------------------------------
# find missing values in dataset if exists
# -------------------------------------------------------------------------
def find_missing_value(feature_names, target, dataset):
    print()
    print('#---------------------------------------------------------------')
    print('Check for Missing Value or NaN Value in the Dataset')
print('#---------------------------------------------------------------')
# Method - 1
# Count Number of Missing Value on Each Column
print('\nCount Number of Missing Value on Each Column: ')
print(dataset.isnull().sum(axis=0))
# Count Number of Missing Value on Each Row
#print('\nCount Number of Missing Value on Each Row: ')
#print(dataset.isnull().sum(axis=1))
# Method - 2
# Check if there are any missing values in Dataset
feature_count = dataset.columns[dataset.isnull().sum() != 0].size
print()
print("Total Features with missing Values = " + str(feature_count))
if (feature_count):
print()
print("Features with NaN => {}".format(list(dataset.columns[dataset.isnull().sum() != 0])))
print('Count Number of Missing Value on Each Column: ')
print(dataset[dataset.columns[dataset.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False))
print()
print('#---------------------------------------------------------------')
print('Check and Remove constant columns in the Dataset')
print('#---------------------------------------------------------------')
colsToRemove = []
for col in dataset.columns:
if col not in ['price_range']:
if dataset[col].std() == 0:
colsToRemove.append(col)
print()
print("Removed `{}` Constant Columns: ".format(len(colsToRemove)))
print(colsToRemove)
# remove constant columns in the Dataset
dataset.drop(colsToRemove, axis=1, inplace=True)
print()
print('#---------------------------------------------------------------')
print('Check and Remove Duplicate Columns in the Dataset')
print('#---------------------------------------------------------------')
print()
print(dataset.columns); print(dataset.head(5))
print('\nDuplicate Columns in the Dataset: \n', dataset.columns.duplicated())
dataset = dataset.loc[:, ~dataset.columns.duplicated()]
print()
print(dataset.columns); print(dataset.head(5))
print()
print('#---------------------------------------------------------------')
print('Check and Drop Sparse Data/Columns in the Dataset')
print('#---------------------------------------------------------------')
    flist = [x for x in dataset.columns if x not in ['price_range']]
print(); print(flist)
for f in flist:
if len(np.unique(dataset[f])) < 2:
print('Feature contains Sparse Data: ', f)
dataset.drop(f, axis=1, inplace=True)
print()
print(dataset.columns); print(dataset.head(5))
# --------------------------------------------------
# Missing Values treatment in the DataSet (if any)
# --------------------------------------------------
# a) Filling NULL values with Zeros
#dataset = dataset.fillna(0)
#print('\nCount Number of Missing Value on Each Column: ')
## Count Number of Missing Value on Each Column
#print(dataset.isnull().sum(axis=0))
#print('\nCount Number of Missing Value on Each Row: ')
## Count Number of Missing Value on Each Row
#print(dataset.isnull().sum(axis=1))
# b) Filling NULL values according to their dataTypes
# Group Dataset according to different dataTypes
gd = dataset.columns.to_series().groupby(dataset.dtypes).groups
print('\nGroup Columns according to their dataTypes: \n', gd)
colNames = dataset.columns.values.tolist()
for colName in colNames:
if dataset[colName].dtypes == 'int64':
dataset[colName] = dataset[colName].fillna(0)
if dataset[colName].dtypes == 'float64':
dataset[colName] = dataset[colName].fillna(0.0)
if dataset[colName].dtypes == 'object':
dataset[colName] = dataset[colName].fillna('Unknown')
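        # Alternatively (a sketch): numeric NaNs are often filled with a column
        # statistic rather than a constant, e.g.:
        #dataset[colName] = dataset[colName].fillna(dataset[colName].median())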
## Count Number of Missing Value on Each Column
print('\nCount Number of Missing Value on Each Column: ')
print(dataset.isnull().sum(axis=0))
## Count Number of Missing Value on Each Row
#print('\nCount Number of Missing Value on Each Row: ')
#print(dataset.isnull().sum(axis=1))
# Check if there are any missing values in Dataset
feature_count = dataset.columns[dataset.isnull().sum() != 0].size
print()
print("Total Features with missing Values = " + str(feature_count))
return(dataset)
# -------------------------------------------------------------------------
# descriptive statistics and correlation matrix
# -------------------------------------------------------------------------
def data_descriptiveStats(feature_names, target, dataset):
# Count Number of Missing Value on Each Column
print(); print('Count Number of Missing Value on Each Column: ')
print(); print(dataset[feature_names].isnull().sum(axis=0))
print(); print(dataset[target].isnull().sum(axis=0))
# Get Information on the feature variables
print(); print('Get Information on the feature variables: ')
print(); print(dataset[feature_names].info())
print(); print(dataset[feature_names].describe())
# correlation
    pd.set_option('display.precision', 2)
print(); print(dataset[feature_names].corr())
# Ranking of Correlation Coefficients among Variable Pairs
print(); print("Ranking of Correlation Coefficients:")
corrank(dataset[feature_names])
# Print Highly Correlated Variables
print(); print("Highly correlated variables (Absolute Correlations):")
print(); print(get_top_abs_correlations(dataset[feature_names], 8))
# Get Information on the target
print(); print(dataset[target].describe())
print(); print(dataset.groupby(target).size())
# -------------------------------------------------------------------------
# data visualisation and correlation graph
# -------------------------------------------------------------------------
def data_visualization(feature_names, target, dataset):
# BOX plots USING box and whisker plots
i = 1
print(); print('BOX plot of each numerical features')
plt.figure(figsize=(11,9))
for col in feature_names:
plt.subplot(5,4,i)
plt.axis('on')
plt.tick_params(axis='both', left=True, top=False, right=False, bottom=True,
labelleft=False, labeltop=False, labelright=False, labelbottom=False)
dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
i += 1
plt.show()
# USING histograms
j = 1
print(); print('Histogram of each Numerical Feature')
plt.figure(figsize=(11,9))
for col in feature_names:
plt.subplot(5,4,j)
plt.axis('on')
plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False,
labelleft=False, labeltop=False, labelright=False, labelbottom=False)
dataset[col].hist()
j += 1
plt.show()
# correlation matrix
print(); print('Correlation Matrix of All Numerical Features')
fig = plt.figure(figsize=(11,9))
ax = fig.add_subplot(111)
cax = ax.matshow(dataset[feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
    ticks = np.arange(0, len(feature_names), 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
plt.show()
# Correlation Plot using seaborn
print(); print("Correlation plot of Numerical features")
# Compute the correlation matrix
corr = dataset[feature_names].corr()
print(corr)
# Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, vmin= -1.0, center=0, square=True,
linewidths=.5, cbar_kws={"shrink": .5})
plt.show()
# Pie chart for Categorical Variables
    print(); print('PIE Chart for Target: ')
plt.figure(figsize=(11,9))
i = 1
for colName in [target]:
labels = []; sizes = [];
df = dataset.groupby(colName).size()
for key in df.keys():
labels.append(key)
sizes.append(df[key])
# Plot PIE Chart with %
plt.subplot(2,2,i)
plt.axis('on')
plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False,
labelleft=True, labeltop=True, labelright=False, labelbottom=False)
plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
i += 1; plt.savefig('Piefig.pdf', format='pdf')
plt.show()
# -------------------------------------------------------------------------
# data split to train and test datasets
# -------------------------------------------------------------------------
def data_split(feature_names, target, dataset):
# Data Transform - Split train : test datasets
X_train, X_test, y_train, y_test = train_test_split(dataset.loc[:, feature_names],
dataset.loc[:, target], test_size=0.20)
return X_train, X_test, y_train, y_test
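# Note: this is an unstratified random 80/20 split; a class-balanced hold-out
# could be requested instead (a sketch):
#X_train, X_test, y_train, y_test = train_test_split(
#    dataset.loc[:, feature_names], dataset.loc[:, target],
#    test_size=0.20, stratify=dataset.loc[:, target])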
def training_model(X_train, y_train):
_value = []; _model = []; _best_features = [];
# Create different Feature subsets
F1 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']
F2 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']
F3 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'touch_screen', 'wifi']
F4 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g'
]
F5 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']
F6 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']
F7 = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'n_cores', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi']
    subsets_sum = [F1, F2, F3, F4, F5, F6, F7]
print(subsets_sum)
    # Twelve random states chosen for the outer MCCV loop
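    # MCCV/RRSSV in brief: each iteration draws a fresh stratified 80/20 split
    # of the training data; an inner 5-fold CV picks the best feature subset,
    # and the 20% outer hold-out scores that choice. Averaging over many random
    # states smooths out the luck of any single split.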
for i in [32,41,45,52,65,72,96,97,112,114,128,142]:
warnings.filterwarnings("ignore")
print ('\n\nRandom state : ', i)
        # price_range has four classes, so use a multiclass objective
        model = xgboost.XGBClassifier(objective='multi:softprob', n_estimators=200)
# Split the dataset into two stratified parts, 80% for Outer training set
X1_train, X1_test, y1_train, y1_test = train_test_split(X_train, y_train,
train_size=0.8, random_state=i, stratify=y_train)
# Choose k-fold cross-validation technique for the inner loop
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
        # Set temporary variables (the inner CV scores accuracy, not AUC)
        best_subset = []
        best_score = -np.inf
        # Loop over the feature combinations
        for subset in subsets_sum:
            score = cross_val_score(model, X=X1_train[subset], y=y1_train,
                                    cv=inner_cv.split(X1_train[subset], y1_train),
                                    scoring='accuracy')
            if score.mean() > best_score:
                best_score = score.mean()
                best_subset = subset
# Train the model on the Outer training set with the selected feature combination
model = model.fit(X1_train[best_subset], y1_train)
# Calculate the predicted labels with the model on the Outer test set with the selected feature combination
y1_pred = model.predict(X1_test[best_subset])
# Calculate the accuracy between predicted and true labels
acc = accuracy_score(y1_test, y1_pred)
print('Selected features:', best_subset,'; Outer Test ACC: ',acc)
_best_features.append(best_subset); _value.append(acc); _model.append(model);
#for i in range(0, len(_value)):
# print(); print(_best_features[i]); print('Accuracy: ',_value[i]); print(_model[i])
print(); print(_value)
print(); print('Maximum Accuracy Index: ', np.argmax(_value))
idx = np.argmax(_value)
print("\nBest model parameters with random_state:"); print(_model[idx])
print("\nBest feature combination:"); print(_best_features[idx])
print("\nBest accuracy from MCCV:"); print(_value[idx])
return(_model[idx], _best_features[idx])
def evaluate_model(model, features, X_train, y_train, X_test, y_test):
print()
print(model.get_params(deep=True))
# Evaluate the skill of the Trained model
pred_Class = model.predict(X_test[features])
acc = accuracy_score(y_test, pred_Class)
classReport = classification_report(y_test, pred_Class)
confMatrix = confusion_matrix(y_test, pred_Class)
kappa_score = cohen_kappa_score(y_test, pred_Class)
print(); print('Evaluation of the trained model: ')
print(); print('Accuracy : ', acc)
print(); print('Kappa Score : ', kappa_score)
print(); print('Confusion Matrix :\n', confMatrix)
print(); print('Classification Report :\n',classReport)
pred_proba = model.predict_proba(X_test[features])
# Add more plots here using scikit-plot
# ROC curves
skplt.metrics.plot_roc(y_test,pred_proba,figsize=(8,6)); plt.show()
# Confusion matrix
skplt.metrics.plot_confusion_matrix(y_test,pred_Class,figsize=(6,6)); plt.show()
# precision recall curve
skplt.metrics.plot_precision_recall(y_test, pred_proba,
title='Precision-Recall Curve', plot_micro=True,
classes_to_plot=None, ax=None, figsize=(9,6),
cmap='nipy_spectral', title_fontsize='large',
text_fontsize='medium'); plt.show()
# Add more ... ... ...
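    # e.g., a normalized confusion matrix (a sketch; same API as above):
    #skplt.metrics.plot_confusion_matrix(y_test, pred_Class, normalize=True, figsize=(6,6)); plt.show()
    # cumulative-gain / lift charts (note: scikit-plot supports these for binary targets only):
    #skplt.metrics.plot_cumulative_gain(y_test, pred_proba, figsize=(8,6)); plt.show()
    #skplt.metrics.plot_lift_curve(y_test, pred_proba, figsize=(8,6)); plt.show()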
# plot learning Curves
#skplt.estimators.plot_learning_curve(model, X_train[features], y_train, figsize=(6,6))
#plt.show()
return model
def featureRank_Analysis(model, dataset, cols):
print()
print("Feature Importance/Rank Analysis: ")
X = dataset.loc[:, cols]; X_cols = X.columns.values
features_imp = model.feature_importances_
indices = np.argsort(features_imp)[::-1]
df = {}
for f in range(X.shape[1]):
print("%d. feature %d %s (%f)" % (f + 1, indices[f], X_cols[indices[f]],
features_imp[indices[f]]))
df[f] = [f + 1, indices[f], X_cols[indices[f]], features_imp[indices[f]]]
df1 = pd.DataFrame.from_dict(df, orient = 'index')
df1.columns = ['feature_Rank', 'feature_Index', 'feature_Name', 'feature_importance']
df1.to_csv("FeatureImportanceRank.csv", index = False)
    # this creates a figure 8 inches wide, 8 inches high
plt.figure(figsize=(8,8))
plt.barh(df1['feature_Rank'], df1['feature_importance'], tick_label = df1['feature_Name'])
plt.savefig('Featurefig.pdf', format='pdf')
plt.show()
skplt.estimators.plot_feature_importances(model, feature_names=cols,
x_tick_rotation = 90, figsize=(8,8))
plt.show()
# ------------------------------------------------
# Visualise the tree-graph (GradientBoosting)
# ------------------------------------------------
# install graphViz and pydotplus using pip
# install binaries from graphViz.org and
# add PATH variables
# Follow the instruction @
# https://stackoverflow.com/questions/18438997/
# why-is-pydot-unable-to-find-graphvizs-executables-in-windows-8
# ------------------------------------------------
    # Pick an arbitrary tree number between (0, 199)
    # as "n_estimators = 200"
    sub_tree_number = 27
# plot tree from Left to Right
xgboost.plot_tree(model, num_trees=sub_tree_number, rankdir='LR')
fig = plt.gcf(); fig.set_size_inches(9, 9); plt.show()
    fig.savefig('treeOpenML_MobilePrice-1.png')
# plot tree top to bottom
xgboost.plot_tree(model, num_trees=sub_tree_number)
fig = plt.gcf(); fig.set_size_inches(9, 9); plt.show()
    fig.savefig('treeOpenML_MobilePrice-2.png')
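    # Alternatively (a sketch, assuming the graphviz binaries are on PATH):
    # xgboost.to_graphviz() returns a graphviz object that can be rendered to file
    #graph = xgboost.to_graphviz(model, num_trees=sub_tree_number, rankdir='LR')
    #graph.render('treeOpenML_MobilePrice_graphviz')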
# plot feature importance
xgboost.plot_importance(model); plt.show()
def save_model(model):
with open('DSC_Recipe_8_model.pickle', 'wb') as f:
pk.dump(model, f)
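    # Alternatively (a sketch): scikit-learn style models are often persisted
    # with joblib, which handles large numpy arrays efficiently:
    #import joblib
    #joblib.dump(model, 'DSC_Recipe_8_model.joblib')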
def final_prediction(feature_names, filename):
# load model
f = open('DSC_Recipe_8_model.pickle', 'rb')
model = pk.load(f); f.close();
# load dataset
dataset = pd.read_csv(filename, sep = ',')
print(dataset.shape); print(dataset.head(5)); print(dataset.columns);
print(dataset.dtypes)
dataset = dummyEncode(dataset)
# final prediction and results
predicted_class = model.predict(dataset[feature_names])
pred_proba = model.predict_proba(dataset[feature_names])
dataset['predicted_class'] = predicted_class
# Evaluate the skill of the Trained model
acc = accuracy_score(dataset['price_range'], predicted_class)
classReport = classification_report(dataset['price_range'], predicted_class)
confMatrix = confusion_matrix(dataset['price_range'], predicted_class)
kappa_score = cohen_kappa_score(dataset['price_range'], predicted_class)
print(); print('Testing Results of the trained model: ')
print(); print('Accuracy : ', acc)
print(); print('Kappa Score : ', kappa_score)
print(); print('Confusion Matrix :\n', confMatrix)
print(); print('Classification Report :\n',classReport)
# ROC curves
skplt.metrics.plot_roc(dataset['price_range'],pred_proba,figsize=(7,7)); plt.show()
# Confusion matrix
skplt.metrics.plot_confusion_matrix(dataset['price_range'],
predicted_class,figsize=(7,7)); plt.show()
# precision recall curve
skplt.metrics.plot_precision_recall(dataset['price_range'], pred_proba,
title='Precision-Recall Curve', plot_micro=True,
classes_to_plot=None, ax=None, figsize=(7,7),
cmap='nipy_spectral', title_fontsize='large',
text_fontsize='medium'); plt.show()
dataset.to_csv('FinalResult.csv', index = False,
columns = ['price_range', 'predicted_class'])
def final_prediction_with_testDataset(feature_names, filename):
# load model
f = open('DSC_Recipe_8_model.pickle', 'rb')
model = pk.load(f); f.close();
# load dataset
dataset = pd.read_csv(filename, sep = ',')
print(dataset.shape); print(dataset.head(5)); print(dataset.columns);
print(dataset.dtypes)
dataset = dummyEncode(dataset)
# final prediction and results
predicted_class = model.predict(dataset[feature_names])
pred_proba = model.predict_proba(dataset[feature_names])
dataset['predicted_class'] = predicted_class
dataset['predicted_proba'] = pred_proba.tolist()
dataset.to_csv('FinalResultWith_testDataset.csv', index = False)
if __name__ == '__main__':
    DSC_Recipe_8()
    print()
    print("Start Time: %s" % time.ctime(start_time))
    filename = 'mobilePriceClassification_trainDataset.csv'
    feature_names, target, dataset = load_dataset(filename)
    dataset = find_missing_value(feature_names, target, dataset)
data_descriptiveStats(feature_names, target, dataset)
data_visualization(feature_names, target, dataset)
X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)
model, features = training_model(X_train, y_train)
model = evaluate_model(model, features, X_train, y_train, X_test, y_test)
featureRank_Analysis(model, dataset, features)
save_model(model)
    # sanity-check the saved model on the labelled training file
    train_filename = 'mobilePriceClassification_trainDataset.csv'
    final_prediction(features, train_filename)
test_filename = 'mobilePriceClassification_testDataset.csv'
final_prediction_with_testDataset(features, test_filename)
print()
print("Execution Time %s seconds: " % (time.time() - start_time))