# For more projects visit: https://setscholars.net
# Silence warning output so the notebook results stay readable.
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pycaret.classification import *
from pycaret.datasets import get_data

plt.style.use('fivethirtyeight')

# Name of the built-in PyCaret dataset to load.
whichDataset = 'employee'
dataset = get_data(whichDataset)
satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | promotion_last_5years | department | salary | left | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 0 | sales | low | 1 |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 0 | sales | medium | 1 |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 0 | sales | medium | 1 |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 0 | sales | low | 1 |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 0 | sales | low | 1 |
# (rows, columns) of the full dataset
dataset.shape
(14999, 10)
# Column names; 'left' is the binary target used below
dataset.columns.to_list()
['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'department', 'salary', 'left']
# Keep 75% of the rows for modeling; the remaining 25% is held back
# as truly unseen data for a final sanity check.
data = dataset.sample(frac=0.75, random_state=1234)
data_unseen = dataset.drop(data.index)

# Re-index both partitions from zero after the split.
for frame in (data, data_unseen):
    frame.reset_index(inplace=True, drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')
Data for Modeling: (11249, 10) Unseen Data For Predictions: (3750, 10)
# Initialise the PyCaret environment: infers column types, encodes
# categoricals, and creates an internal train/test split (target: 'left').
env_setup = setup(data = data, target = 'left', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | left |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (11249, 10) |
5 | Missing Values | False |
6 | Numeric Features | 3 |
7 | Categorical Features | 6 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (7874, 32) |
12 | Transformed Test Set | (3375, 32) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 9cc7 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
# Suppress library chatter emitted while many models are fitted.
import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

# --------------------------------------
# Cross-validate every available classifier and keep the best performer.
best_model = compare_models()
# --------------------------------------
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
rf | Random Forest Classifier | 0.9864 | 0.9890 | 0.9528 | 0.9901 | 0.9711 | 0.9622 | 0.9626 | 0.4150 |
lightgbm | Light Gradient Boosting Machine | 0.9836 | 0.9914 | 0.9528 | 0.9783 | 0.9654 | 0.9546 | 0.9548 | 0.0610 |
xgboost | Extreme Gradient Boosting | 0.9831 | 0.9901 | 0.9518 | 0.9774 | 0.9643 | 0.9532 | 0.9535 | 0.5000 |
catboost | CatBoost Classifier | 0.9818 | 0.9904 | 0.9417 | 0.9820 | 0.9613 | 0.9495 | 0.9499 | 2.2450 |
et | Extra Trees Classifier | 0.9802 | 0.9869 | 0.9454 | 0.9713 | 0.9581 | 0.9451 | 0.9454 | 0.4100 |
gbc | Gradient Boosting Classifier | 0.9751 | 0.9863 | 0.9232 | 0.9717 | 0.9467 | 0.9305 | 0.9311 | 0.3200 |
dt | Decision Tree Classifier | 0.9742 | 0.9682 | 0.9565 | 0.9373 | 0.9468 | 0.9298 | 0.9299 | 0.0190 |
ada | Ada Boost Classifier | 0.9589 | 0.9803 | 0.9057 | 0.9214 | 0.9134 | 0.8864 | 0.8865 | 0.1290 |
knn | K Neighbors Classifier | 0.9314 | 0.9646 | 0.8945 | 0.8321 | 0.8620 | 0.8165 | 0.8176 | 0.0880 |
lr | Logistic Regression | 0.8945 | 0.9349 | 0.7949 | 0.7720 | 0.7829 | 0.7133 | 0.7137 | 0.6080 |
ridge | Ridge Classifier | 0.8778 | 0.0000 | 0.7901 | 0.7259 | 0.7560 | 0.6748 | 0.6764 | 0.0110 |
lda | Linear Discriminant Analysis | 0.8778 | 0.9272 | 0.8341 | 0.7090 | 0.7660 | 0.6841 | 0.6887 | 0.0270 |
svm | SVM - Linear Kernel | 0.7806 | 0.0000 | 0.7394 | 0.5726 | 0.5833 | 0.4683 | 0.5077 | 0.0660 |
dummy | Dummy Classifier | 0.7604 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0080 |
nb | Naive Bayes | 0.7136 | 0.9135 | 0.9587 | 0.4543 | 0.6163 | 0.4313 | 0.5083 | 0.0120 |
qda | Quadratic Discriminant Analysis | 0.5244 | 0.5949 | 0.7304 | 0.3243 | 0.4244 | 0.1405 | 0.1826 | 0.0150 |
# Train an XGBoost classifier with default hyperparameters (10-fold CV).
xgboost = create_model('xgboost')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9873 | 0.9934 | 0.9735 | 0.9735 | 0.9735 | 0.9652 | 0.9652 |
1 | 0.9810 | 0.9847 | 0.9524 | 0.9677 | 0.9600 | 0.9475 | 0.9476 |
2 | 0.9708 | 0.9893 | 0.9259 | 0.9511 | 0.9383 | 0.9192 | 0.9194 |
3 | 0.9860 | 0.9940 | 0.9471 | 0.9944 | 0.9702 | 0.9611 | 0.9616 |
4 | 0.9822 | 0.9880 | 0.9309 | 0.9943 | 0.9615 | 0.9500 | 0.9509 |
5 | 0.9797 | 0.9864 | 0.9521 | 0.9624 | 0.9572 | 0.9439 | 0.9439 |
6 | 0.9873 | 0.9937 | 0.9628 | 0.9837 | 0.9731 | 0.9648 | 0.9649 |
7 | 0.9860 | 0.9876 | 0.9683 | 0.9734 | 0.9708 | 0.9616 | 0.9616 |
8 | 0.9848 | 0.9924 | 0.9418 | 0.9944 | 0.9674 | 0.9575 | 0.9581 |
9 | 0.9860 | 0.9917 | 0.9630 | 0.9785 | 0.9707 | 0.9615 | 0.9615 |
Mean | 0.9831 | 0.9901 | 0.9518 | 0.9774 | 0.9643 | 0.9532 | 0.9535 |
SD | 0.0048 | 0.0032 | 0.0149 | 0.0140 | 0.0102 | 0.0134 | 0.0134 |
# Random-grid hyperparameter search over the XGBoost model.
tuned_xgboost = tune_model(xgboost)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9873 | 0.9958 | 0.9735 | 0.9735 | 0.9735 | 0.9652 | 0.9652 |
1 | 0.9835 | 0.9836 | 0.9577 | 0.9731 | 0.9653 | 0.9545 | 0.9546 |
2 | 0.9721 | 0.9868 | 0.9365 | 0.9465 | 0.9415 | 0.9232 | 0.9232 |
3 | 0.9822 | 0.9902 | 0.9577 | 0.9679 | 0.9628 | 0.9511 | 0.9511 |
4 | 0.9822 | 0.9849 | 0.9415 | 0.9833 | 0.9620 | 0.9504 | 0.9507 |
5 | 0.9822 | 0.9827 | 0.9628 | 0.9628 | 0.9628 | 0.9511 | 0.9511 |
6 | 0.9771 | 0.9905 | 0.9681 | 0.9381 | 0.9529 | 0.9378 | 0.9380 |
7 | 0.9822 | 0.9828 | 0.9683 | 0.9581 | 0.9632 | 0.9514 | 0.9515 |
8 | 0.9797 | 0.9885 | 0.9577 | 0.9577 | 0.9577 | 0.9443 | 0.9443 |
9 | 0.9822 | 0.9889 | 0.9524 | 0.9730 | 0.9626 | 0.9509 | 0.9510 |
Mean | 0.9811 | 0.9875 | 0.9576 | 0.9634 | 0.9604 | 0.9480 | 0.9481 |
SD | 0.0039 | 0.0040 | 0.0111 | 0.0130 | 0.0080 | 0.0106 | 0.0106 |
# Show the hyperparameters selected by tuning.
print(tuned_xgboost)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)
# Diagnostic plots for the tuned XGBoost model.
for diagnostic in ('auc', 'pr', 'feature', 'confusion_matrix',
                   'learning', 'threshold'):
    plot_model(tuned_xgboost, plot=diagnostic)

# Train a Random Forest with default hyperparameters (10-fold CV).
rf = create_model('rf')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9898 | 0.9955 | 0.9683 | 0.9892 | 0.9786 | 0.9720 | 0.9721 |
1 | 0.9848 | 0.9856 | 0.9471 | 0.9890 | 0.9676 | 0.9576 | 0.9580 |
2 | 0.9746 | 0.9837 | 0.9312 | 0.9617 | 0.9462 | 0.9296 | 0.9298 |
3 | 0.9873 | 0.9883 | 0.9524 | 0.9945 | 0.9730 | 0.9647 | 0.9651 |
4 | 0.9860 | 0.9901 | 0.9415 | 1.0000 | 0.9699 | 0.9608 | 0.9615 |
5 | 0.9873 | 0.9900 | 0.9574 | 0.9890 | 0.9730 | 0.9647 | 0.9649 |
6 | 0.9898 | 0.9868 | 0.9681 | 0.9891 | 0.9785 | 0.9718 | 0.9719 |
7 | 0.9924 | 0.9906 | 0.9683 | 1.0000 | 0.9839 | 0.9789 | 0.9791 |
8 | 0.9835 | 0.9885 | 0.9418 | 0.9889 | 0.9648 | 0.9540 | 0.9545 |
9 | 0.9886 | 0.9913 | 0.9524 | 1.0000 | 0.9756 | 0.9681 | 0.9686 |
Mean | 0.9864 | 0.9890 | 0.9528 | 0.9901 | 0.9711 | 0.9622 | 0.9626 |
SD | 0.0046 | 0.0031 | 0.0122 | 0.0106 | 0.0099 | 0.0129 | 0.0129 |
# Random-grid hyperparameter search over the Random Forest model.
tuned_rf = tune_model(rf)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9708 | 0.9931 | 0.9153 | 0.9611 | 0.9377 | 0.9186 | 0.9191 |
1 | 0.9759 | 0.9860 | 0.9418 | 0.9570 | 0.9493 | 0.9335 | 0.9336 |
2 | 0.9607 | 0.9807 | 0.8783 | 0.9540 | 0.9146 | 0.8891 | 0.8904 |
3 | 0.9721 | 0.9873 | 0.9101 | 0.9718 | 0.9399 | 0.9217 | 0.9226 |
4 | 0.9746 | 0.9825 | 0.8989 | 0.9941 | 0.9441 | 0.9277 | 0.9297 |
5 | 0.9720 | 0.9870 | 0.9202 | 0.9611 | 0.9402 | 0.9220 | 0.9224 |
6 | 0.9822 | 0.9886 | 0.9468 | 0.9780 | 0.9622 | 0.9505 | 0.9508 |
7 | 0.9822 | 0.9877 | 0.9524 | 0.9730 | 0.9626 | 0.9509 | 0.9510 |
8 | 0.9733 | 0.9875 | 0.9048 | 0.9828 | 0.9421 | 0.9248 | 0.9262 |
9 | 0.9771 | 0.9905 | 0.9206 | 0.9831 | 0.9508 | 0.9359 | 0.9368 |
Mean | 0.9741 | 0.9871 | 0.9189 | 0.9716 | 0.9444 | 0.9275 | 0.9282 |
SD | 0.0059 | 0.0034 | 0.0218 | 0.0124 | 0.0130 | 0.0168 | 0.0165 |
# Show the tuned Random Forest configuration.
print(tuned_rf)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0, min_impurity_split=None, min_samples_leaf=5, min_samples_split=7, min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
# The same diagnostic plots for the tuned Random Forest.
for diagnostic in ('auc', 'pr', 'feature', 'confusion_matrix',
                   'learning', 'threshold'):
    plot_model(tuned_rf, plot=diagnostic)

# Score the tuned XGBoost model on the internal hold-out test split.
predict_model(tuned_xgboost);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Extreme Gradient Boosting | 0.9813 | 0.9902 | 0.9562 | 0.9667 | 0.9614 | 0.9491 | 0.9491 |
# Score the tuned Random Forest on the internal hold-out test split.
predict_model(tuned_rf);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Random Forest Classifier | 0.9698 | 0.9878 | 0.9086 | 0.9651 | 0.9360 | 0.9163 | 0.9170 |
# Refit the tuned model on the full modeling data (train + test).
final_xgboost = finalize_model(tuned_xgboost)
# Final model parameters for deployment
print(final_xgboost)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)
# NOTE(review): this scores the split the finalized model was refit on,
# so the reported metrics are optimistic — use the unseen data below instead.
predict_model(final_xgboost);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Extreme Gradient Boosting | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
# Predict on the 25% hold-out never seen during modeling; PyCaret appends
# 'Label' (predicted class) and 'Score' (probability) columns.
unseen_predictions = predict_model(final_xgboost, data=data_unseen)
unseen_predictions.head()
satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | promotion_last_5years | department | salary | left | Label | Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 0 | sales | low | 1 | 1 | 0.9994 |
1 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 0 | sales | low | 1 | 1 | 1.0000 |
2 | 0.92 | 0.85 | 5 | 259 | 5 | 0 | 0 | sales | low | 1 | 1 | 1.0000 |
3 | 0.45 | 0.54 | 2 | 135 | 3 | 0 | 0 | sales | low | 1 | 1 | 1.0000 |
4 | 0.41 | 0.55 | 2 | 148 | 3 | 0 | 0 | sales | low | 1 | 1 | 1.0000 |
from pycaret.utils import check_metric
# Accuracy of the predictions on the unseen hold-out
check_metric(unseen_predictions['left'], unseen_predictions['Label'], metric = 'Accuracy')
0.9896
# Recall on the unseen hold-out
check_metric(unseen_predictions['left'], unseen_predictions['Label'], metric = 'Recall')
0.9803
# Precision on the unseen hold-out
check_metric(unseen_predictions['left'], unseen_predictions['Label'], metric = 'Precision')
0.9747
# AUC on the unseen hold-out
check_metric(unseen_predictions['left'], unseen_predictions['Label'], metric = 'AUC')
0.9863
# F1 on the unseen hold-out
check_metric(unseen_predictions['left'], unseen_predictions['Label'], metric = 'F1')
0.9775
# Persist the full preprocessing pipeline + model to Final_Model.pkl
save_model(final_xgboost,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='left', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_strateg... interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)]], verbose=False), 'Final_Model.pkl')
# Reload the saved pipeline from disk.
load_saved_model = load_model('Final_Model')
Transformation Pipeline and Model Successfully Loaded
# Score the reloaded pipeline on the unseen rows.
new_prediction = predict_model(load_saved_model, data=data_unseen)
#new_prediction[["Label", "Score"]].head(10)

# ---- Second experiment: a fresh session with different random seeds ----
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pycaret.classification import *
from pycaret.datasets import get_data

plt.style.use('fivethirtyeight')

# Name of the built-in PyCaret dataset to load.
whichDataset = 'employee'
dataset = get_data(whichDataset)

# 75/25 modeling vs. unseen split again, now seeded with 421.
data = dataset.sample(frac=0.75, random_state=421)
data_unseen = dataset.drop(data.index)
for frame in (data, data_unseen):
    frame.reset_index(inplace=True, drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')
satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | promotion_last_5years | department | salary | left | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 0 | sales | low | 1 |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 0 | sales | medium | 1 |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 0 | sales | medium | 1 |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 0 | sales | low | 1 |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 0 | sales | low | 1 |
Data for Modeling: (11249, 10) Unseen Data For Predictions: (3750, 10)
# New PyCaret environment for the second experiment (session seed 412).
clf = setup(data = data, target = 'left', session_id=412)
Description | Value | |
---|---|---|
0 | session_id | 412 |
1 | Target | left |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (11249, 10) |
5 | Missing Values | False |
6 | Numeric Features | 3 |
7 | Categorical Features | 6 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (7874, 32) |
12 | Transformed Test Set | (3375, 32) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 3d5a |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
# Suppress library chatter emitted while many models are fitted.
import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

# Cross-validate all baseline classifiers and keep the five best performers.
top_models = compare_models(n_select = 5)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
rf | Random Forest Classifier | 0.9856 | 0.9878 | 0.9493 | 0.9900 | 0.9692 | 0.9599 | 0.9603 | 0.3870 |
xgboost | Extreme Gradient Boosting | 0.9839 | 0.9873 | 0.9504 | 0.9813 | 0.9656 | 0.9551 | 0.9553 | 0.4660 |
lightgbm | Light Gradient Boosting Machine | 0.9832 | 0.9895 | 0.9483 | 0.9808 | 0.9642 | 0.9533 | 0.9535 | 0.0540 |
catboost | CatBoost Classifier | 0.9803 | 0.9892 | 0.9387 | 0.9779 | 0.9578 | 0.9450 | 0.9454 | 2.2120 |
et | Extra Trees Classifier | 0.9794 | 0.9868 | 0.9424 | 0.9705 | 0.9562 | 0.9427 | 0.9429 | 0.3680 |
gbc | Gradient Boosting Classifier | 0.9756 | 0.9877 | 0.9285 | 0.9679 | 0.9477 | 0.9318 | 0.9322 | 0.2920 |
dt | Decision Tree Classifier | 0.9693 | 0.9630 | 0.9509 | 0.9229 | 0.9366 | 0.9163 | 0.9166 | 0.0170 |
ada | Ada Boost Classifier | 0.9605 | 0.9823 | 0.9131 | 0.9207 | 0.9168 | 0.8909 | 0.8910 | 0.1130 |
knn | K Neighbors Classifier | 0.9305 | 0.9633 | 0.8987 | 0.8259 | 0.8605 | 0.8144 | 0.8159 | 0.0840 |
lr | Logistic Regression | 0.8952 | 0.9351 | 0.7941 | 0.7729 | 0.7832 | 0.7142 | 0.7144 | 0.4750 |
ridge | Ridge Classifier | 0.8811 | 0.0000 | 0.8053 | 0.7264 | 0.7636 | 0.6845 | 0.6863 | 0.0090 |
lda | Linear Discriminant Analysis | 0.8805 | 0.9266 | 0.8379 | 0.7121 | 0.7697 | 0.6898 | 0.6941 | 0.0210 |
dummy | Dummy Classifier | 0.7619 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0070 |
svm | SVM - Linear Kernel | 0.7528 | 0.0000 | 0.5069 | 0.4377 | 0.3716 | 0.2770 | 0.3222 | 0.0440 |
nb | Naive Bayes | 0.7056 | 0.9104 | 0.9429 | 0.4446 | 0.6042 | 0.4147 | 0.4895 | 0.0090 |
qda | Quadratic Discriminant Analysis | 0.4985 | 0.5734 | 0.7163 | 0.2913 | 0.4065 | 0.1045 | 0.1323 | 0.0130 |
# Inspect the five selected estimators.
top_models
[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False), XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=412, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='auto', validate_parameters=1, verbosity=0), LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=412, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), <catboost.core.CatBoostClassifier at 0x7f0914f08110>, ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False)]
# Hyperparameter-tune each of the five selected base models.
tuned_top_models = []
for base_model in top_models:
    tuned_top_models.append(tune_model(base_model))
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9518 | 0.9800 | 0.9415 | 0.8676 | 0.9031 | 0.8710 | 0.8723 |
1 | 0.9442 | 0.9732 | 0.9096 | 0.8636 | 0.8860 | 0.8491 | 0.8496 |
2 | 0.9404 | 0.9772 | 0.9255 | 0.8406 | 0.8810 | 0.8413 | 0.8431 |
3 | 0.9454 | 0.9753 | 0.9043 | 0.8718 | 0.8877 | 0.8517 | 0.8519 |
4 | 0.9581 | 0.9842 | 0.9358 | 0.8929 | 0.9138 | 0.8862 | 0.8866 |
5 | 0.9517 | 0.9724 | 0.9144 | 0.8860 | 0.9000 | 0.8682 | 0.8684 |
6 | 0.9454 | 0.9764 | 0.9251 | 0.8564 | 0.8895 | 0.8532 | 0.8544 |
7 | 0.9492 | 0.9738 | 0.9305 | 0.8657 | 0.8969 | 0.8632 | 0.8642 |
8 | 0.9377 | 0.9618 | 0.8877 | 0.8557 | 0.8714 | 0.8303 | 0.8306 |
9 | 0.9504 | 0.9742 | 0.9149 | 0.8821 | 0.8982 | 0.8654 | 0.8657 |
Mean | 0.9474 | 0.9748 | 0.9189 | 0.8682 | 0.8928 | 0.8580 | 0.8587 |
SD | 0.0057 | 0.0055 | 0.0152 | 0.0149 | 0.0115 | 0.0153 | 0.0152 |
# Inspect the tuned versions of the five models.
tuned_top_models
[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='gini', max_depth=8, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=5, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=270, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False), XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=2, missing=nan, monotone_constraints='()', n_estimators=260, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=412, reg_alpha=0.01, reg_lambda=3, scale_pos_weight=42.7, subsample=0.2, tree_method='auto', validate_parameters=1, verbosity=0), LGBMClassifier(bagging_fraction=0.6, bagging_freq=2, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=31, min_child_weight=0.001, min_split_gain=0.3, n_estimators=130, n_jobs=-1, num_leaves=60, objective=None, random_state=412, reg_alpha=0.005, reg_lambda=0.001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), <catboost.core.CatBoostClassifier at 0x7f0914a82890>, ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=6, max_features=1.0, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.005, min_impurity_split=None, min_samples_leaf=6, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False)]
# Wrap each tuned model in a bagging ensemble.
bagged_top_models = []
for tuned in tuned_top_models:
    bagged_top_models.append(ensemble_model(tuned))
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9505 | 0.9798 | 0.9521 | 0.8565 | 0.9018 | 0.8688 | 0.8710 |
1 | 0.9404 | 0.9744 | 0.9096 | 0.8507 | 0.8792 | 0.8396 | 0.8405 |
2 | 0.9378 | 0.9749 | 0.9362 | 0.8263 | 0.8778 | 0.8363 | 0.8393 |
3 | 0.9416 | 0.9793 | 0.9149 | 0.8515 | 0.8821 | 0.8433 | 0.8443 |
4 | 0.9593 | 0.9848 | 0.9358 | 0.8974 | 0.9162 | 0.8894 | 0.8897 |
5 | 0.9492 | 0.9735 | 0.9144 | 0.8769 | 0.8953 | 0.8618 | 0.8621 |
6 | 0.9441 | 0.9775 | 0.9251 | 0.8522 | 0.8872 | 0.8501 | 0.8514 |
7 | 0.9517 | 0.9752 | 0.9198 | 0.8821 | 0.9005 | 0.8687 | 0.8690 |
8 | 0.9250 | 0.9709 | 0.8877 | 0.8137 | 0.8491 | 0.7994 | 0.8007 |
9 | 0.9479 | 0.9742 | 0.9149 | 0.8731 | 0.8935 | 0.8590 | 0.8595 |
Mean | 0.9448 | 0.9765 | 0.9211 | 0.8580 | 0.8883 | 0.8516 | 0.8527 |
SD | 0.0089 | 0.0038 | 0.0167 | 0.0241 | 0.0172 | 0.0231 | 0.0228 |
# Inspect the bagged ensembles.
bagged_top_models
[BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='gini', max_depth=8, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=5, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=270, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False), BaggingClassifier(base_estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=2, missing=nan, monotone_constraints='()', n_estimators=260, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=412, reg_alpha=0.01, reg_lambda=3, scale_pos_weight=42.7, subsample=0.2, tree_method='auto', validate_parameters=1, verbosity=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False), BaggingClassifier(base_estimator=LGBMClassifier(bagging_fraction=0.6, bagging_freq=2, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=31, min_child_weight=0.001, min_split_gain=0.3, n_estimators=130, n_jobs=-1, num_leaves=60, objective=None, random_state=412, reg_alpha=0.005, reg_lambda=0.001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False), 
BaggingClassifier(base_estimator=<catboost.core.CatBoostClassifier object at 0x7f08ecea2ed0>, bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False), BaggingClassifier(base_estimator=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=6, max_features=1.0, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.005, min_impurity_split=None, min_samples_leaf=6, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False)]
# Let PyCaret pick the overall best model created in this session,
# once per optimization metric.
best1 = automl(optimize = 'AUC')
best2 = automl(optimize = 'Accuracy')
best3 = automl(optimize = 'Recall')
best4 = automl(optimize = 'Precision')
best5 = automl(optimize = 'F1')

# Report each winner.
for metric_name, winner in [('AUC', best1), ('Accuracy', best2),
                            ('Recall', best3), ('Precision', best4),
                            ('F1', best5)]:
    print()
    print(f"Best model based on {metric_name}: ")
    print(winner)
Best model based on AUC: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=412, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0) Best model based on Accuracy: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False) Best model based on Recall: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=2, missing=nan, monotone_constraints='()', n_estimators=260, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=412, reg_alpha=0.01, reg_lambda=3, scale_pos_weight=42.7, subsample=0.2, tree_method='auto', validate_parameters=1, verbosity=0) Best model based on Precision: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False) Best model based on F1: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, 
min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False)
# Visual diagnostics for the accuracy-optimized winner, then persist it
# (pipeline + model) as Final_Model.pkl.
for diagnostic in ('auc', 'confusion_matrix', 'learning'):
    plot_model(best2, plot=diagnostic)

save_model(best2,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='left', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_strateg... RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False)]], verbose=False), 'Final_Model.pkl')
# Reload the persisted pipeline and predict on the unseen rows;
# show the predicted class ('Label') and probability ('Score').
load_saved_model = load_model('Final_Model')
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head()
Transformation Pipeline and Model Successfully Loaded
Label | Score | |
---|---|---|
0 | 1 | 1.00 |
1 | 1 | 1.00 |
2 | 1 | 1.00 |
3 | 1 | 0.97 |
4 | 1 | 1.00 |
In this coding recipe, we discussed how to build a machine learning model in Python using PyCaret.