For more projects visit: https://setscholars.net
# Suppress warnings in Jupyter Notebooks
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# provide the dataset name as listed in PyCaret's dataset repository
whichDataset = 'diabetes'
from pycaret.datasets import get_data
dataset = get_data(whichDataset)
Row | Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | Class variable |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
dataset.shape
(768, 9)
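get_data() returns one of PyCaret's bundled demo datasets as a pandas DataFrame; 'diabetes' is the classic Pima Indians Diabetes data (768 rows, 9 columns). Nothing else in the recipe depends on the demo loader, so a plain pandas read works just as well (the file name below is a hypothetical placeholder):

# Hypothetical alternative: run the same recipe on your own data
# dataset = pd.read_csv('my_data.csv')  # any DataFrame with a target column will do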
data = dataset.sample(frac=0.75, random_state=421)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
Data for Modeling: (576, 9)
Unseen Data For Predictions: (192, 9)
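The 25% held back here never enters setup(), so it serves as genuinely unseen data for the final sanity check. The sample()/drop() pattern draws rows at random; if you would rather preserve the class ratio in both parts, a minimal sklearn-based alternative (same 75/25 split, same seed) looks like this:

from sklearn.model_selection import train_test_split
# Stratified variant of the same 75/25 split
data, data_unseen = train_test_split(
    dataset, test_size=0.25, stratify=dataset['Class variable'], random_state=421
)
data = data.reset_index(drop=True)
data_unseen = data_unseen.reset_index(drop=True)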
env_setup = setup(data = data, target = 'Class variable', session_id=412)
Row | Description | Value |
---|---|---|
0 | session_id | 412 |
1 | Target | Class variable |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (576, 9) |
5 | Missing Values | False |
6 | Numeric Features | 7 |
7 | Categorical Features | 1 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (403, 23) |
12 | Transformed Test Set | (173, 23) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | f6ff |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
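setup() infers column types, builds the preprocessing pipeline, and carves an internal train/test split (70/30 by default, hence the 403 and 173 rows above). Note that the 9 original columns become 23 transformed features: one predictor was inferred as categorical (most likely the integer pregnancy count) and one-hot encoded. If that inference is unwanted, the types can be forced explicitly; a minimal sketch:

# Force every predictor to be treated as numeric instead of relying on type inference
predictors = [c for c in data.columns if c != 'Class variable']
env_setup = setup(data=data, target='Class variable',
                  numeric_features=predictors, session_id=412)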
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
best_model = compare_models()
ID | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
---|---|---|---|---|---|---|---|---|---|
lr | Logistic Regression | 0.7768 | 0.8229 | 0.5901 | 0.7170 | 0.6383 | 0.4813 | 0.4918 | 0.3080 |
catboost | CatBoost Classifier | 0.7743 | 0.8624 | 0.6253 | 0.6887 | 0.6524 | 0.4867 | 0.4900 | 0.8420 |
ridge | Ridge Classifier | 0.7742 | 0.0000 | 0.5824 | 0.7175 | 0.6344 | 0.4751 | 0.4864 | 0.0060 |
lda | Linear Discriminant Analysis | 0.7717 | 0.8130 | 0.5753 | 0.7142 | 0.6282 | 0.4681 | 0.4798 | 0.0060 |
rf | Random Forest Classifier | 0.7668 | 0.8399 | 0.5390 | 0.7154 | 0.6104 | 0.4500 | 0.4617 | 0.2650 |
et | Extra Trees Classifier | 0.7641 | 0.8169 | 0.5308 | 0.7039 | 0.6019 | 0.4408 | 0.4510 | 0.2630 |
xgboost | Extreme Gradient Boosting | 0.7568 | 0.8148 | 0.6033 | 0.6616 | 0.6256 | 0.4474 | 0.4521 | 0.0860 |
knn | K Neighbors Classifier | 0.7544 | 0.8033 | 0.5962 | 0.6602 | 0.6201 | 0.4406 | 0.4466 | 0.0670 |
gbc | Gradient Boosting Classifier | 0.7520 | 0.8277 | 0.5890 | 0.6576 | 0.6194 | 0.4367 | 0.4396 | 0.0460 |
lightgbm | Light Gradient Boosting Machine | 0.7516 | 0.8279 | 0.5967 | 0.6551 | 0.6191 | 0.4368 | 0.4415 | 0.0160 |
ada | Ada Boost Classifier | 0.7396 | 0.7920 | 0.5604 | 0.6367 | 0.5912 | 0.4028 | 0.4080 | 0.0420 |
dt | Decision Tree Classifier | 0.7171 | 0.6906 | 0.6044 | 0.5984 | 0.5944 | 0.3797 | 0.3846 | 0.0060 |
nb | Naive Bayes | 0.6698 | 0.7454 | 0.2225 | 0.5583 | 0.3126 | 0.1484 | 0.1768 | 0.0060 |
dummy | Dummy Classifier | 0.6551 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0040 |
svm | SVM - Linear Kernel | 0.6374 | 0.0000 | 0.3824 | 0.4822 | 0.3683 | 0.1532 | 0.1773 | 0.0060 |
qda | Quadratic Discriminant Analysis | 0.5701 | 0.6264 | 0.5857 | 0.3600 | 0.4278 | 0.1410 | 0.1375 | 0.0090 |
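compare_models() cross-validates every estimator in PyCaret's model library with the fold settings from setup() and ranks them, by Accuracy by default. Ridge Classifier and the linear SVM report an AUC of 0.0000 simply because they expose no predict_proba, not because they discriminate poorly. To rank by a different metric and keep several candidates:

# Rank by AUC instead of Accuracy and return the three best fitted models
best_by_auc = compare_models(sort='AUC', n_select=3)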
catboost = create_model('catboost')
Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
---|---|---|---|---|---|---|---|
0 | 0.8049 | 0.9101 | 0.6429 | 0.7500 | 0.6923 | 0.5507 | 0.5542 |
1 | 0.7073 | 0.7513 | 0.5000 | 0.5833 | 0.5385 | 0.3260 | 0.3281 |
2 | 0.7561 | 0.8545 | 0.6429 | 0.6429 | 0.6429 | 0.4577 | 0.4577 |
3 | 0.7500 | 0.8034 | 0.5385 | 0.6364 | 0.5833 | 0.4065 | 0.4094 |
4 | 0.7750 | 0.8846 | 0.7143 | 0.6667 | 0.6897 | 0.5135 | 0.5143 |
5 | 0.8500 | 0.9148 | 0.7143 | 0.8333 | 0.7692 | 0.6591 | 0.6634 |
6 | 0.7250 | 0.8929 | 0.5714 | 0.6154 | 0.5926 | 0.3855 | 0.3861 |
7 | 0.8750 | 0.9038 | 0.8571 | 0.8000 | 0.8276 | 0.7297 | 0.7308 |
8 | 0.7250 | 0.8626 | 0.4286 | 0.6667 | 0.5217 | 0.3413 | 0.3577 |
9 | 0.7750 | 0.8462 | 0.6429 | 0.6923 | 0.6667 | 0.4972 | 0.4980 |
Mean | 0.7743 | 0.8624 | 0.6253 | 0.6887 | 0.6524 | 0.4867 | 0.4900 |
SD | 0.0520 | 0.0493 | 0.1163 | 0.0770 | 0.0927 | 0.1258 | 0.1244 |
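create_model() trains a single estimator with stratified 10-fold cross-validation (the fold count chosen in setup()); the Mean row reproduces the CatBoost line of the comparison table above. The fold count can be overridden per call:

# Same model, 5-fold cross-validation instead of 10
catboost_5fold = create_model('catboost', fold=5)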
tuned_catboost = tune_model(catboost)
Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
---|---|---|---|---|---|---|---|
0 | 0.8293 | 0.9048 | 0.6429 | 0.8182 | 0.7200 | 0.5997 | 0.6087 |
1 | 0.6341 | 0.6984 | 0.3571 | 0.4545 | 0.4000 | 0.1423 | 0.1444 |
2 | 0.7317 | 0.8280 | 0.6429 | 0.6000 | 0.6207 | 0.4135 | 0.4141 |
3 | 0.7500 | 0.7721 | 0.6154 | 0.6154 | 0.6154 | 0.4302 | 0.4302 |
4 | 0.8000 | 0.8984 | 0.7143 | 0.7143 | 0.7143 | 0.5604 | 0.5604 |
5 | 0.8500 | 0.8681 | 0.8571 | 0.7500 | 0.8000 | 0.6809 | 0.6847 |
6 | 0.7500 | 0.8874 | 0.5714 | 0.6667 | 0.6154 | 0.4318 | 0.4346 |
7 | 0.8000 | 0.8736 | 0.6429 | 0.7500 | 0.6923 | 0.5455 | 0.5490 |
8 | 0.7750 | 0.8242 | 0.5000 | 0.7778 | 0.6087 | 0.4611 | 0.4832 |
9 | 0.7750 | 0.8242 | 0.7143 | 0.6667 | 0.6897 | 0.5135 | 0.5143 |
Mean | 0.7695 | 0.8379 | 0.6258 | 0.6814 | 0.6476 | 0.4779 | 0.4824 |
SD | 0.0570 | 0.0608 | 0.1269 | 0.1007 | 0.1011 | 0.1379 | 0.1387 |
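tune_model() runs a randomized search (10 candidate settings by default) over a predefined hyperparameter grid and optimizes Accuracy unless told otherwise; with so few iterations the tuned mean accuracy (0.7695) can land slightly below the default model's (0.7743), as it does here. Searching longer and targeting a different metric is a one-liner:

# Broader search (25 candidates) optimized for AUC rather than Accuracy
tuned_catboost_auc = tune_model(catboost, optimize='AUC', n_iter=25)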
print(tuned_catboost)
<catboost.core.CatBoostClassifier object at 0x7fd8c613fa90>
plot_model(tuned_catboost, plot = 'auc')
plot_model(tuned_catboost, plot = 'pr')
plot_model(tuned_catboost, plot='feature')
plot_model(tuned_catboost, plot = 'confusion_matrix')
plot_model(tuned_catboost, plot = 'learning')
plot_model(tuned_catboost, plot = 'threshold')
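Each plot_model() call above renders one diagnostic, computed on the internal test set: ROC curve, precision-recall curve, feature importance, confusion matrix, learning curve, and discrimination threshold. In a notebook, evaluate_model() offers the same plots behind a single interactive widget:

# Interactive alternative: browse all available diagnostics from one widget
evaluate_model(tuned_catboost)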
lr = create_model('lr')
Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
---|---|---|---|---|---|---|---|
0 | 0.8049 | 0.8598 | 0.5000 | 0.8750 | 0.6364 | 0.5162 | 0.5540 |
1 | 0.6829 | 0.6640 | 0.3571 | 0.5556 | 0.4348 | 0.2287 | 0.2394 |
2 | 0.8049 | 0.8757 | 0.5714 | 0.8000 | 0.6667 | 0.5341 | 0.5492 |
3 | 0.8000 | 0.8291 | 0.6154 | 0.7273 | 0.6667 | 0.5252 | 0.5290 |
4 | 0.7750 | 0.8462 | 0.7143 | 0.6667 | 0.6897 | 0.5135 | 0.5143 |
5 | 0.8250 | 0.8187 | 0.7857 | 0.7333 | 0.7586 | 0.6216 | 0.6225 |
6 | 0.7250 | 0.8324 | 0.5000 | 0.6364 | 0.5600 | 0.3642 | 0.3698 |
7 | 0.8250 | 0.8544 | 0.7857 | 0.7333 | 0.7586 | 0.6216 | 0.6225 |
8 | 0.7500 | 0.8242 | 0.4286 | 0.7500 | 0.5455 | 0.3902 | 0.4193 |
9 | 0.7750 | 0.8242 | 0.6429 | 0.6923 | 0.6667 | 0.4972 | 0.4980 |
Mean | 0.7768 | 0.8229 | 0.5901 | 0.7170 | 0.6383 | 0.4813 | 0.4918 |
SD | 0.0436 | 0.0557 | 0.1387 | 0.0832 | 0.0950 | 0.1151 | 0.1127 |
tuned_lr = tune_model(lr)
Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
---|---|---|---|---|---|---|---|
0 | 0.8049 | 0.8492 | 0.5000 | 0.8750 | 0.6364 | 0.5162 | 0.5540 |
1 | 0.6829 | 0.6772 | 0.3571 | 0.5556 | 0.4348 | 0.2287 | 0.2394 |
2 | 0.8049 | 0.8757 | 0.5714 | 0.8000 | 0.6667 | 0.5341 | 0.5492 |
3 | 0.8000 | 0.8319 | 0.6154 | 0.7273 | 0.6667 | 0.5252 | 0.5290 |
4 | 0.7750 | 0.8489 | 0.7143 | 0.6667 | 0.6897 | 0.5135 | 0.5143 |
5 | 0.8000 | 0.8132 | 0.7857 | 0.6875 | 0.7333 | 0.5745 | 0.5777 |
6 | 0.7500 | 0.8352 | 0.5000 | 0.7000 | 0.5833 | 0.4118 | 0.4237 |
7 | 0.8250 | 0.8516 | 0.7857 | 0.7333 | 0.7586 | 0.6216 | 0.6225 |
8 | 0.7500 | 0.8159 | 0.4286 | 0.7500 | 0.5455 | 0.3902 | 0.4193 |
9 | 0.7750 | 0.8214 | 0.6429 | 0.6923 | 0.6667 | 0.4972 | 0.4980 |
Mean | 0.7768 | 0.8220 | 0.5901 | 0.7188 | 0.6382 | 0.4813 | 0.4927 |
SD | 0.0390 | 0.0515 | 0.1387 | 0.0798 | 0.0903 | 0.1061 | 0.1035 |
print(tuned_lr)
LogisticRegression(C=1.908, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=412, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
plot_model(tuned_lr, plot = 'auc')
plot_model(tuned_lr, plot = 'pr')
plot_model(tuned_lr, plot = 'feature')
plot_model(tuned_lr, plot = 'confusion_matrix')
plot_model(tuned_lr, plot = 'learning')
plot_model(tuned_lr, plot = 'threshold')
predict_model(tuned_catboost);
Row | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
---|---|---|---|---|---|---|---|---|
0 | CatBoost Classifier | 0.7688 | 0.7692 | 0.5455 | 0.6667 | 0.6000 | 0.4397 | 0.4441 |
predict_model(tuned_lr);
Row | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
---|---|---|---|---|---|---|---|---|
0 | Logistic Regression | 0.7688 | 0.8294 | 0.4545 | 0.7143 | 0.5556 | 0.4096 | 0.4287 |
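Called without a data argument, predict_model() scores the internal hold-out split created by setup() (173 rows here) and appends Label and Score columns; the one-row grid above is that hold-out performance. The hold-out features and labels are also reachable directly if needed:

# Access the internal hold-out split that predict_model() just scored
X_test = get_config('X_test')
y_test = get_config('y_test')
print(X_test.shape, y_test.shape)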
final_lr = finalize_model(tuned_lr)
# Final model parameters for deployment
print(final_lr)
LogisticRegression(C=1.908, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=412, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
predict_model(final_lr);
Row | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
---|---|---|---|---|---|---|---|---|
0 | Logistic Regression | 0.8208 | 0.8565 | 0.5818 | 0.8000 | 0.6737 | 0.5544 | 0.5677 |
unseen_predictions = predict_model(final_lr, data=data_unseen)
unseen_predictions.head()
Row | Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | Class variable | Label | Score |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 1 | 0.8475 |
1 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 1 | 0.8887 |
2 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 | 0 | 0.8673 |
3 | 10 | 168 | 74 | 0 | 0 | 38.0 | 0.537 | 34 | 1 | 1 | 0.9040 |
4 | 10 | 139 | 80 | 0 | 0 | 27.1 | 1.441 | 57 | 0 | 1 | 0.7729 |
from pycaret.utils import check_metric
check_metric(unseen_predictions['Class variable'], unseen_predictions['Label'], metric = 'Accuracy')
0.7083
check_metric(unseen_predictions['Class variable'], unseen_predictions['Label'], metric = 'Recall')
0.5
check_metric(unseen_predictions['Class variable'], unseen_predictions['Label'], metric = 'Precision')
0.6607
check_metric(unseen_predictions['Class variable'], unseen_predictions['Label'], metric = 'AUC')
0.6695
check_metric(unseen_predictions['Class variable'], unseen_predictions['Label'], metric = 'F1')
0.5692
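The unseen-data accuracy (0.7083) sits noticeably below the 0.8208 reported after finalization, and that is expected: finalize_model() refits the pipeline on all 576 modelling rows, including the former hold-out, so the post-finalization hold-out grid is optimistic, while these 192 reserved rows give the honest estimate. For a fuller per-class breakdown, sklearn's report can be applied to the same columns:

from sklearn.metrics import classification_report
# Per-class precision/recall/F1 on the truly unseen rows
print(classification_report(unseen_predictions['Class variable'],
                            unseen_predictions['Label']))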
save_model(final_lr,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='Class variable', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeri... ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'), ('dfs', 'passthrough'), ('pca', 'passthrough'), ['trained_model', LogisticRegression(C=1.908, class_weight={}, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=412, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)]], verbose=False), 'Final_Model.pkl')
load_saved_model = load_model('Final_Model')
Transformation Pipeline and Model Successfully Loaded
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head(10)
Row | Label | Score |
---|---|---|
0 | 1 | 0.8475 |
1 | 1 | 0.8887 |
2 | 0 | 0.8673 |
3 | 1 | 0.9040 |
4 | 1 | 0.7729 |
5 | 0 | 0.7984 |
6 | 0 | 0.7777 |
7 | 1 | 0.5320 |
8 | 1 | 0.5829 |
9 | 0 | 0.5471 |
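Because save_model() pickled the preprocessing pipeline together with the estimator, a fresh session can score raw, untransformed rows directly. A minimal sketch, assuming Final_Model.pkl is on disk and the input columns match the training data:

from pycaret.classification import load_model, predict_model
# Score a single raw record; the saved pipeline applies all transformations internally
pipeline = load_model('Final_Model')
one_row = data_unseen.drop(columns=['Class variable']).iloc[[0]]
print(predict_model(pipeline, data=one_row)[['Label', 'Score']])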
The second half of this recipe repeats the same preparation, but instead of working with a single algorithm it selects, tunes, and ensembles the top five models before letting automl() pick the winner.
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# provide the dataset name as listed in PyCaret's dataset repository
whichDataset = 'diabetes'
from pycaret.datasets import get_data
dataset = get_data(whichDataset)
data = dataset.sample(frac=0.75, random_state=421)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
Row | Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | Class variable |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
Data for Modeling: (576, 9)
Unseen Data For Predictions: (192, 9)
clf = setup(data = data, target = 'Class variable', session_id=412)
Row | Description | Value |
---|---|---|
0 | session_id | 412 |
1 | Target | Class variable |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (576, 9) |
5 | Missing Values | False |
6 | Numeric Features | 7 |
7 | Categorical Features | 1 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (403, 23) |
12 | Transformed Test Set | (173, 23) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | b219 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# compare all baseline models and select top 5
top_models = compare_models(n_select = 5)
ID | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
---|---|---|---|---|---|---|---|---|---|
lr | Logistic Regression | 0.7768 | 0.8229 | 0.5901 | 0.7170 | 0.6383 | 0.4813 | 0.4918 | 0.3100 |
catboost | CatBoost Classifier | 0.7743 | 0.8624 | 0.6253 | 0.6887 | 0.6524 | 0.4867 | 0.4900 | 0.8770 |
ridge | Ridge Classifier | 0.7742 | 0.0000 | 0.5824 | 0.7175 | 0.6344 | 0.4751 | 0.4864 | 0.0060 |
lda | Linear Discriminant Analysis | 0.7717 | 0.8130 | 0.5753 | 0.7142 | 0.6282 | 0.4681 | 0.4798 | 0.0060 |
rf | Random Forest Classifier | 0.7668 | 0.8399 | 0.5390 | 0.7154 | 0.6104 | 0.4500 | 0.4617 | 0.2640 |
et | Extra Trees Classifier | 0.7641 | 0.8169 | 0.5308 | 0.7039 | 0.6019 | 0.4408 | 0.4510 | 0.2660 |
xgboost | Extreme Gradient Boosting | 0.7568 | 0.8148 | 0.6033 | 0.6616 | 0.6256 | 0.4474 | 0.4521 | 0.0870 |
knn | K Neighbors Classifier | 0.7544 | 0.8033 | 0.5962 | 0.6602 | 0.6201 | 0.4406 | 0.4466 | 0.0670 |
gbc | Gradient Boosting Classifier | 0.7520 | 0.8277 | 0.5890 | 0.6576 | 0.6194 | 0.4367 | 0.4396 | 0.0450 |
lightgbm | Light Gradient Boosting Machine | 0.7516 | 0.8279 | 0.5967 | 0.6551 | 0.6191 | 0.4368 | 0.4415 | 0.0160 |
ada | Ada Boost Classifier | 0.7396 | 0.7920 | 0.5604 | 0.6367 | 0.5912 | 0.4028 | 0.4080 | 0.0420 |
dt | Decision Tree Classifier | 0.7171 | 0.6906 | 0.6044 | 0.5984 | 0.5944 | 0.3797 | 0.3846 | 0.0060 |
nb | Naive Bayes | 0.6698 | 0.7454 | 0.2225 | 0.5583 | 0.3126 | 0.1484 | 0.1768 | 0.0060 |
dummy | Dummy Classifier | 0.6551 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0040 |
svm | SVM - Linear Kernel | 0.6374 | 0.0000 | 0.3824 | 0.4822 | 0.3683 | 0.1532 | 0.1773 | 0.0060 |
qda | Quadratic Discriminant Analysis | 0.5701 | 0.6264 | 0.5857 | 0.3600 | 0.4278 | 0.1410 | 0.1375 | 0.0070 |
top_models
[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=412, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False),
 <catboost.core.CatBoostClassifier at 0x7f5c6e10de10>,
 RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=412, solver='auto', tol=0.001),
 LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None, solver='svd', store_covariance=False, tol=0.0001),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False)]
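With n_select=5, compare_models() returns the five top-ranked fitted estimators as a plain Python list (here lr, catboost, ridge, lda, and rf), so they can be iterated over like any other sequence:

# Inspect the returned candidates
for rank, model in enumerate(top_models, start=1):
    print(rank, type(model).__name__)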
# tune top base models
tuned_top_models = [tune_model(i) for i in top_models]
Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
---|---|---|---|---|---|---|---|
0 | 0.8293 | 0.8915 | 0.7857 | 0.7333 | 0.7586 | 0.6268 | 0.6277 |
1 | 0.7317 | 0.7619 | 0.5714 | 0.6154 | 0.5926 | 0.3930 | 0.3936 |
2 | 0.8049 | 0.8571 | 0.8571 | 0.6667 | 0.7500 | 0.5941 | 0.6067 |
3 | 0.6750 | 0.7350 | 0.6154 | 0.5000 | 0.5517 | 0.3011 | 0.3051 |
4 | 0.8000 | 0.8709 | 0.7857 | 0.6875 | 0.7333 | 0.5745 | 0.5777 |
5 | 0.8500 | 0.9066 | 0.8571 | 0.7500 | 0.8000 | 0.6809 | 0.6847 |
6 | 0.8250 | 0.9038 | 0.7857 | 0.7333 | 0.7586 | 0.6216 | 0.6225 |
7 | 0.8000 | 0.8709 | 0.9286 | 0.6500 | 0.7647 | 0.6000 | 0.6290 |
8 | 0.7500 | 0.8434 | 0.6429 | 0.6429 | 0.6429 | 0.4505 | 0.4505 |
9 | 0.8000 | 0.8681 | 0.7143 | 0.7143 | 0.7143 | 0.5604 | 0.5604 |
Mean | 0.7866 | 0.8509 | 0.7544 | 0.6693 | 0.7067 | 0.5403 | 0.5458 |
SD | 0.0500 | 0.0549 | 0.1101 | 0.0705 | 0.0782 | 0.1135 | 0.1157 |
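When tune_model() runs inside a list comprehension, only the fold grid of the last model tuned (here the random forest) stays on screen. pull() retrieves the most recently displayed scoring grid, so an explicit loop can keep all five:

# Keep every model's CV grid while tuning in a loop
tuned_top_models, cv_grids = [], []
for m in top_models:
    tuned_top_models.append(tune_model(m))
    cv_grids.append(pull())  # pull() returns the last displayed scoring grid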
tuned_top_models
[LogisticRegression(C=1.908, class_weight={}, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=412, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False),
 <catboost.core.CatBoostClassifier at 0x7f5c6c0fd350>,
 RidgeClassifier(alpha=6.49, class_weight=None, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=412, solver='auto', tol=0.001),
 LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=0.0005, solver='lsqr', store_covariance=False, tol=0.0001),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='gini', max_depth=8, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=5, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=270, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False)]
# ensemble top tuned models
bagged_top_models = [ensemble_model(i) for i in tuned_top_models]
Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
---|---|---|---|---|---|---|---|
0 | 0.8537 | 0.9153 | 0.7857 | 0.7857 | 0.7857 | 0.6746 | 0.6746 |
1 | 0.6585 | 0.7407 | 0.5000 | 0.5000 | 0.5000 | 0.2407 | 0.2407 |
2 | 0.8049 | 0.8571 | 0.8571 | 0.6667 | 0.7500 | 0.5941 | 0.6067 |
3 | 0.7000 | 0.7493 | 0.6923 | 0.5294 | 0.6000 | 0.3668 | 0.3752 |
4 | 0.7750 | 0.8599 | 0.7143 | 0.6667 | 0.6897 | 0.5135 | 0.5143 |
5 | 0.8000 | 0.9038 | 0.7857 | 0.6875 | 0.7333 | 0.5745 | 0.5777 |
6 | 0.8250 | 0.8901 | 0.7857 | 0.7333 | 0.7586 | 0.6216 | 0.6225 |
7 | 0.7500 | 0.8626 | 0.8571 | 0.6000 | 0.7059 | 0.5000 | 0.5241 |
8 | 0.7250 | 0.8544 | 0.5714 | 0.6154 | 0.5926 | 0.3855 | 0.3861 |
9 | 0.8250 | 0.8846 | 0.7857 | 0.7333 | 0.7586 | 0.6216 | 0.6225 |
Mean | 0.7717 | 0.8518 | 0.7335 | 0.6518 | 0.6874 | 0.5093 | 0.5145 |
SD | 0.0591 | 0.0569 | 0.1115 | 0.0866 | 0.0883 | 0.1311 | 0.1313 |
bagged_top_models
[BaggingClassifier(base_estimator=LogisticRegression(C=1.908, class_weight={}, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=412, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False),
 BaggingClassifier(base_estimator=<catboost.core.CatBoostClassifier object at 0x7f5c6c0d6310>, bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False),
 BaggingClassifier(base_estimator=RidgeClassifier(alpha=6.49, class_weight=None, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=412, solver='auto', tol=0.001), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False),
 BaggingClassifier(base_estimator=LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=0.0005, solver='lsqr', store_covariance=False, tol=0.0001), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False),
 BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='gini', max_depth=8, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=5, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=270, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False)]
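ensemble_model() defaults to bagging: each tuned estimator is wrapped in a scikit-learn BaggingClassifier with 10 estimators. Boosting, voting, and stacking are the other ensembling routes PyCaret offers; hedged sketches:

# Boost instead of bag (AdaBoost-style reweighting around one base estimator)
boosted_rf = ensemble_model(tuned_top_models[-1], method='Boosting')
# Combine all five tuned models by voting ...
blender = blend_models(estimator_list=tuned_top_models)
# ... or by training a meta-model on their predictions
stacker = stack_models(estimator_list=tuned_top_models)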
# select the best model in the session for each metric of interest
best1 = automl(optimize = 'AUC')
best2 = automl(optimize = 'Accuracy')
best3 = automl(optimize = 'Recall')
best4 = automl(optimize = 'Precision')
best5 = automl(optimize = 'F1')
print(); print("Best model based on AUC: "); print(best1)
print(); print("Best model based on Accuracy: "); print(best2)
print(); print("Best model based on Recall: "); print(best3)
print(); print("Best model based on Precision: "); print(best4)
print(); print("Best model based on F1: "); print(best5)
Best model based on AUC:
<catboost.core.CatBoostClassifier object at 0x7f5c6ea0a250>

Best model based on Accuracy:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='gini', max_depth=8, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=5, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=270, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False)

Best model based on Recall:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='gini', max_depth=8, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=5, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=270, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False)

Best model based on Precision:
BaggingClassifier(base_estimator=RidgeClassifier(alpha=6.49, class_weight=None, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=412, solver='auto', tol=0.001), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=412, verbose=0, warm_start=False)

Best model based on F1:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='gini', max_depth=8, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=5, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=270, n_jobs=-1, oob_score=False, random_state=412, verbose=0, warm_start=False)
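automl() scans every model trained in the current session, base, tuned, and bagged alike, and returns the one with the best cross-validated score for the requested metric; here CatBoost wins on AUC while the tuned random forest takes Accuracy, Recall, and F1. Before committing to one, each winner can be sanity-checked on the truly unseen rows:

from pycaret.utils import check_metric
# Compare the metric winners on the 192 reserved rows
for name, m in [('AUC', best1), ('Accuracy', best2), ('F1', best5)]:
    preds = predict_model(m, data=data_unseen)
    acc = check_metric(preds['Class variable'], preds['Label'], metric='Accuracy')
    print(name, 'winner -> unseen accuracy:', acc)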
plot_model(best1, plot = 'auc')
plot_model(best1, plot = 'confusion_matrix')
plot_model(best1, plot = 'learning')
save_model(best1,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='Class variable', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeri... ('cluster_all', 'passthrough'), ('dummy', Dummify(target='Class variable')), ('fix_perfect', Remove_100(target='Class variable')), ('clean_names', Clean_Colum_Names()), ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'), ('dfs', 'passthrough'), ('pca', 'passthrough'), ['trained_model', <catboost.core.CatBoostClassifier object at 0x7f5c6ea0a250>]], verbose=False), 'Final_Model.pkl')
load_saved_model = load_model('Final_Model')
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head()
Transformation Pipeline and Model Successfully Loaded
Row | Label | Score |
---|---|---|
0 | 1 | 0.7370 |
1 | 1 | 0.6251 |
2 | 0 | 0.8190 |
3 | 1 | 0.9534 |
4 | 0 | 0.7222 |
In this coding recipe, we showed how to use PyCaret in Python to set up a classification experiment, compare and tune candidate models, ensemble the strongest performers, evaluate them on truly unseen data, and save the final pipeline for deployment.