For more projects visit: https://setscholars.net
# Suppress warnings in Jupyter Notebooks
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# Name of the PyCaret sample dataset to load; it must match a dataset name
# as listed by PyCaret.
whichDataset = 'employee'
from pycaret.datasets import get_data
# Load the dataset; the result is used below as a pandas DataFrame
# (.shape, .sample, .drop). In a notebook this call also displays the first rows.
dataset = get_data(whichDataset)
satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | promotion_last_5years | department | salary | left | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 0 | sales | low | 1 |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 0 | sales | medium | 1 |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 0 | sales | medium | 1 |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 0 | sales | low | 1 |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 0 | sales | low | 1 |
# Inspect the dataset dimensions as (rows, columns).
dataset.shape
(14999, 10)
# List the column names; 'left' is the column passed to setup() as the target below.
dataset.columns.to_list()
['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'department', 'salary', 'left']
# Hold out a quarter of the rows as "unseen" data: draw a reproducible 75%
# sample for modelling and keep every row that was not drawn for the final
# prediction check. Both frames get a fresh 0..n-1 index.
data = dataset.sample(frac=0.75, random_state=1234)
data_unseen = dataset.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)

# Report the size of each partition.
print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')
Data for Modeling: (11249, 10) Unseen Data For Predictions: (3750, 10)
# Initialise the PyCaret classification experiment on the modelling frame.
# 'left' is the label column; session_id is fixed at 1234 so the run can be
# repeated with the same settings (see the setup summary grid that follows).
env_setup = setup(
    data=data,
    target='left',
    session_id=1234,
)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | left |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (11249, 10) |
5 | Missing Values | False |
6 | Numeric Features | 3 |
7 | Categorical Features | 6 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (7874, 32) |
12 | Transformed Test Set | (3375, 32) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 9cc7 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
# Re-apply the "ignore" warning filters right before the model comparison.
# `warnings` is already imported and filtered at the top of the file.
# NOTE(review): presumably repeated because an earlier step may reset the
# warning filters - confirm before removing.
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# --------------------------------------
# Evaluate the candidate classifiers and keep the top-ranked one. In the
# results grid that follows, rows are ordered by Accuracy with Random Forest first.
best_model = compare_models()
# --------------------------------------
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
rf | Random Forest Classifier | 0.9864 | 0.9890 | 0.9528 | 0.9901 | 0.9711 | 0.9622 | 0.9626 | 0.4150 |
lightgbm | Light Gradient Boosting Machine | 0.9836 | 0.9914 | 0.9528 | 0.9783 | 0.9654 | 0.9546 | 0.9548 | 0.0610 |
xgboost | Extreme Gradient Boosting | 0.9831 | 0.9901 | 0.9518 | 0.9774 | 0.9643 | 0.9532 | 0.9535 | 0.5000 |
catboost | CatBoost Classifier | 0.9818 | 0.9904 | 0.9417 | 0.9820 | 0.9613 | 0.9495 | 0.9499 | 2.2450 |
et | Extra Trees Classifier | 0.9802 | 0.9869 | 0.9454 | 0.9713 | 0.9581 | 0.9451 | 0.9454 | 0.4100 |
gbc | Gradient Boosting Classifier | 0.9751 | 0.9863 | 0.9232 | 0.9717 | 0.9467 | 0.9305 | 0.9311 | 0.3200 |
dt | Decision Tree Classifier | 0.9742 | 0.9682 | 0.9565 | 0.9373 | 0.9468 | 0.9298 | 0.9299 | 0.0190 |
ada | Ada Boost Classifier | 0.9589 | 0.9803 | 0.9057 | 0.9214 | 0.9134 | 0.8864 | 0.8865 | 0.1290 |
knn | K Neighbors Classifier | 0.9314 | 0.9646 | 0.8945 | 0.8321 | 0.8620 | 0.8165 | 0.8176 | 0.0880 |
lr | Logistic Regression | 0.8945 | 0.9349 | 0.7949 | 0.7720 | 0.7829 | 0.7133 | 0.7137 | 0.6080 |
ridge | Ridge Classifier | 0.8778 | 0.0000 | 0.7901 | 0.7259 | 0.7560 | 0.6748 | 0.6764 | 0.0110 |
lda | Linear Discriminant Analysis | 0.8778 | 0.9272 | 0.8341 | 0.7090 | 0.7660 | 0.6841 | 0.6887 | 0.0270 |
svm | SVM - Linear Kernel | 0.7806 | 0.0000 | 0.7394 | 0.5726 | 0.5833 | 0.4683 | 0.5077 | 0.0660 |
dummy | Dummy Classifier | 0.7604 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0080 |
nb | Naive Bayes | 0.7136 | 0.9135 | 0.9587 | 0.4543 | 0.6163 | 0.4313 | 0.5083 | 0.0120 |
qda | Quadratic Discriminant Analysis | 0.5244 | 0.5949 | 0.7304 | 0.3243 | 0.4244 | 0.1405 | 0.1826 | 0.0150 |
# Train the 'xgboost' estimator (Extreme Gradient Boosting in the comparison
# grid); the output lists metrics for folds 0-9 plus their mean and SD.
# NOTE(review): compare_models() ranked 'rf' first by Accuracy; xgboost is
# selected here by hand rather than via best_model.
xgboost = create_model('xgboost')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9873 | 0.9934 | 0.9735 | 0.9735 | 0.9735 | 0.9652 | 0.9652 |
1 | 0.9810 | 0.9847 | 0.9524 | 0.9677 | 0.9600 | 0.9475 | 0.9476 |
2 | 0.9708 | 0.9893 | 0.9259 | 0.9511 | 0.9383 | 0.9192 | 0.9194 |
3 | 0.9860 | 0.9940 | 0.9471 | 0.9944 | 0.9702 | 0.9611 | 0.9616 |
4 | 0.9822 | 0.9880 | 0.9309 | 0.9943 | 0.9615 | 0.9500 | 0.9509 |
5 | 0.9797 | 0.9864 | 0.9521 | 0.9624 | 0.9572 | 0.9439 | 0.9439 |
6 | 0.9873 | 0.9937 | 0.9628 | 0.9837 | 0.9731 | 0.9648 | 0.9649 |
7 | 0.9860 | 0.9876 | 0.9683 | 0.9734 | 0.9708 | 0.9616 | 0.9616 |
8 | 0.9848 | 0.9924 | 0.9418 | 0.9944 | 0.9674 | 0.9575 | 0.9581 |
9 | 0.9860 | 0.9917 | 0.9630 | 0.9785 | 0.9707 | 0.9615 | 0.9615 |
Mean | 0.9831 | 0.9901 | 0.9518 | 0.9774 | 0.9643 | 0.9532 | 0.9535 |
SD | 0.0048 | 0.0032 | 0.0149 | 0.0140 | 0.0102 | 0.0134 | 0.0134 |
# Hyperparameter-tune the xgboost model; the output again lists per-fold
# metrics with mean and SD.
# NOTE(review): per the two result tables, mean Accuracy went from 0.9831
# (untuned) to 0.9811 (tuned) while mean Recall rose from 0.9518 to 0.9576.
# If the tuned model should only replace the base model when it scores
# better, consider tune_model's `choose_better` option - confirm against the
# installed PyCaret version.
tuned_xgboost = tune_model(xgboost)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9873 | 0.9958 | 0.9735 | 0.9735 | 0.9735 | 0.9652 | 0.9652 |
1 | 0.9835 | 0.9836 | 0.9577 | 0.9731 | 0.9653 | 0.9545 | 0.9546 |
2 | 0.9721 | 0.9868 | 0.9365 | 0.9465 | 0.9415 | 0.9232 | 0.9232 |
3 | 0.9822 | 0.9902 | 0.9577 | 0.9679 | 0.9628 | 0.9511 | 0.9511 |
4 | 0.9822 | 0.9849 | 0.9415 | 0.9833 | 0.9620 | 0.9504 | 0.9507 |
5 | 0.9822 | 0.9827 | 0.9628 | 0.9628 | 0.9628 | 0.9511 | 0.9511 |
6 | 0.9771 | 0.9905 | 0.9681 | 0.9381 | 0.9529 | 0.9378 | 0.9380 |
7 | 0.9822 | 0.9828 | 0.9683 | 0.9581 | 0.9632 | 0.9514 | 0.9515 |
8 | 0.9797 | 0.9885 | 0.9577 | 0.9577 | 0.9577 | 0.9443 | 0.9443 |
9 | 0.9822 | 0.9889 | 0.9524 | 0.9730 | 0.9626 | 0.9509 | 0.9510 |
Mean | 0.9811 | 0.9875 | 0.9576 | 0.9634 | 0.9604 | 0.9480 | 0.9481 |
SD | 0.0039 | 0.0040 | 0.0111 | 0.0130 | 0.0080 | 0.0106 | 0.0106 |
# Print the tuned estimator's repr to inspect the hyperparameters in effect.
print(tuned_xgboost)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)
# Render the diagnostic plots for the tuned XGBoost model, one call per
# plot type, in the same order as before.
for plot_kind in (
    'auc',
    'pr',
    'feature',
    'confusion_matrix',
    'learning',
    'threshold',
):
    plot_model(tuned_xgboost, plot=plot_kind)
# Train the 'rf' estimator (Random Forest Classifier, the top row of the
# comparison grid); the output lists metrics for folds 0-9 plus mean and SD.
rf = create_model('rf')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9898 | 0.9955 | 0.9683 | 0.9892 | 0.9786 | 0.9720 | 0.9721 |
1 | 0.9848 | 0.9856 | 0.9471 | 0.9890 | 0.9676 | 0.9576 | 0.9580 |
2 | 0.9746 | 0.9837 | 0.9312 | 0.9617 | 0.9462 | 0.9296 | 0.9298 |
3 | 0.9873 | 0.9883 | 0.9524 | 0.9945 | 0.9730 | 0.9647 | 0.9651 |
4 | 0.9860 | 0.9901 | 0.9415 | 1.0000 | 0.9699 | 0.9608 | 0.9615 |
5 | 0.9873 | 0.9900 | 0.9574 | 0.9890 | 0.9730 | 0.9647 | 0.9649 |
6 | 0.9898 | 0.9868 | 0.9681 | 0.9891 | 0.9785 | 0.9718 | 0.9719 |
7 | 0.9924 | 0.9906 | 0.9683 | 1.0000 | 0.9839 | 0.9789 | 0.9791 |
8 | 0.9835 | 0.9885 | 0.9418 | 0.9889 | 0.9648 | 0.9540 | 0.9545 |
9 | 0.9886 | 0.9913 | 0.9524 | 1.0000 | 0.9756 | 0.9681 | 0.9686 |
Mean | 0.9864 | 0.9890 | 0.9528 | 0.9901 | 0.9711 | 0.9622 | 0.9626 |
SD | 0.0046 | 0.0031 | 0.0122 | 0.0106 | 0.0099 | 0.0129 | 0.0129 |
# Hyperparameter-tune the random forest model; the output lists per-fold
# metrics with mean and SD.
# NOTE(review): per the two result tables, tuning lowered the mean CV scores
# (Accuracy 0.9864 -> 0.9741, Recall 0.9528 -> 0.9189, F1 0.9711 -> 0.9444),
# yet the tuned model is what gets printed and plotted below. Consider
# tune_model's `choose_better` option, or keep using `rf` - confirm against
# the installed PyCaret version.
tuned_rf = tune_model(rf)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9708 | 0.9931 | 0.9153 | 0.9611 | 0.9377 | 0.9186 | 0.9191 |
1 | 0.9759 | 0.9860 | 0.9418 | 0.9570 | 0.9493 | 0.9335 | 0.9336 |
2 | 0.9607 | 0.9807 | 0.8783 | 0.9540 | 0.9146 | 0.8891 | 0.8904 |
3 | 0.9721 | 0.9873 | 0.9101 | 0.9718 | 0.9399 | 0.9217 | 0.9226 |
4 | 0.9746 | 0.9825 | 0.8989 | 0.9941 | 0.9441 | 0.9277 | 0.9297 |
5 | 0.9720 | 0.9870 | 0.9202 | 0.9611 | 0.9402 | 0.9220 | 0.9224 |
6 | 0.9822 | 0.9886 | 0.9468 | 0.9780 | 0.9622 | 0.9505 | 0.9508 |
7 | 0.9822 | 0.9877 | 0.9524 | 0.9730 | 0.9626 | 0.9509 | 0.9510 |
8 | 0.9733 | 0.9875 | 0.9048 | 0.9828 | 0.9421 | 0.9248 | 0.9262 |
9 | 0.9771 | 0.9905 | 0.9206 | 0.9831 | 0.9508 | 0.9359 | 0.9368 |
Mean | 0.9741 | 0.9871 | 0.9189 | 0.9716 | 0.9444 | 0.9275 | 0.9282 |
SD | 0.0059 | 0.0034 | 0.0218 | 0.0124 | 0.0130 | 0.0168 | 0.0165 |
# Print the tuned estimator's repr to inspect the hyperparameters in effect.
print(tuned_rf)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0, min_impurity_split=None, min_samples_leaf=5, min_samples_split=7, min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
# Render the diagnostic plots for the tuned random forest model, one call
# per plot type, in the same order as before.
for plot_kind in ('auc', 'pr', 'feature', 'confusion_matrix', 'learning'):
    plot_model(tuned_rf, plot=plot_kind)