For more projects visit: https://setscholars.net
# Suppress warnings in Jupyter Notebooks
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# provide the dataset name as shown in pycaret
whichDataset = 'glass'
from pycaret.datasets import get_data
dataset = get_data(whichDataset)
| | RI | Na | Mg | Al | Si | K | Ca | Ba | Fe | Type |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.52101 | 13.64 | 4.49 | 1.10 | 71.78 | 0.06 | 8.75 | 0.0 | 0.0 | 1 |
1 | 1.51761 | 13.89 | 3.60 | 1.36 | 72.73 | 0.48 | 7.83 | 0.0 | 0.0 | 1 |
2 | 1.51618 | 13.53 | 3.55 | 1.54 | 72.99 | 0.39 | 7.78 | 0.0 | 0.0 | 1 |
3 | 1.51766 | 13.21 | 3.69 | 1.29 | 72.61 | 0.57 | 8.22 | 0.0 | 0.0 | 1 |
4 | 1.51742 | 13.27 | 3.62 | 1.24 | 73.08 | 0.55 | 8.07 | 0.0 | 0.0 | 1 |
dataset.shape
(214, 10)
dataset.columns.to_list()
['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']
data = dataset.sample(frac=0.75, random_state=1234)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
Data for Modeling: (160, 10)
Unseen Data For Predictions: (54, 10)
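One caveat on this split: `sample(frac=0.75)` draws rows uniformly at random, so the rarer glass types can end up under-represented in either partition. If that is a concern, a stratified hold-out is a simple alternative (a sketch using scikit-learn, not part of the original recipe):

from sklearn.model_selection import train_test_split

# Keep each glass Type at the same proportion in both partitions
data, data_unseen = train_test_split(
    dataset, train_size=0.75, stratify=dataset['Type'], random_state=1234
)
data = data.reset_index(drop=True)
data_unseen = data_unseen.reset_index(drop=True)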
env_setup = setup(data = data, target = 'Type', session_id=1234)
| | Description | Value |
|---|---|---|
0 | session_id | 1234 |
1 | Target | Type |
2 | Target Type | Multiclass |
3 | Label Encoded | None |
4 | Original Data | (160, 10) |
5 | Missing Values | False |
6 | Numeric Features | 9 |
7 | Categorical Features | 0 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (111, 9) |
12 | Transformed Test Set | (49, 9) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 55bc |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# --------------------------------------
best_model = compare_models()
# --------------------------------------
| | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
|---|---|---|---|---|---|---|---|---|---|
et | Extra Trees Classifier | 0.7848 | 0.1819 | 0.7472 | 0.7585 | 0.7616 | 0.7080 | 0.7204 | 0.2570 |
catboost | CatBoost Classifier | 0.7826 | 0.1966 | 0.6767 | 0.7520 | 0.7542 | 0.7028 | 0.7187 | 1.7660 |
rf | Random Forest Classifier | 0.7568 | 0.1886 | 0.6750 | 0.7342 | 0.7332 | 0.6677 | 0.6819 | 0.2660 |
xgboost | Extreme Gradient Boosting | 0.7553 | 0.1927 | 0.6872 | 0.7523 | 0.7354 | 0.6702 | 0.6895 | 0.0860 |
lightgbm | Light Gradient Boosting Machine | 0.7303 | 0.1873 | 0.6033 | 0.6778 | 0.6889 | 0.6312 | 0.6490 | 0.0250 |
gbc | Gradient Boosting Classifier | 0.7129 | 0.1781 | 0.6600 | 0.6861 | 0.6837 | 0.6117 | 0.6290 | 0.1890 |
dt | Decision Tree Classifier | 0.6492 | 0.1589 | 0.5803 | 0.6670 | 0.6266 | 0.5324 | 0.5504 | 0.0060 |
knn | K Neighbors Classifier | 0.6318 | 0.1579 | 0.4714 | 0.5902 | 0.5845 | 0.4873 | 0.5144 | 0.0670 |
lda | Linear Discriminant Analysis | 0.6220 | 0.1758 | 0.5614 | 0.5776 | 0.5778 | 0.4771 | 0.4959 | 0.0070 |
lr | Logistic Regression | 0.5955 | 0.1601 | 0.5064 | 0.5348 | 0.5517 | 0.4383 | 0.4525 | 0.3210 |
ridge | Ridge Classifier | 0.5955 | 0.0000 | 0.5031 | 0.5118 | 0.5263 | 0.4351 | 0.4659 | 0.0140 |
nb | Naive Bayes | 0.4962 | 0.1670 | 0.5297 | 0.5860 | 0.4902 | 0.3673 | 0.3970 | 0.0060 |
svm | SVM - Linear Kernel | 0.3788 | 0.0000 | 0.2833 | 0.1902 | 0.2418 | 0.1382 | 0.1928 | 0.0360 |
ada | Ada Boost Classifier | 0.3697 | 0.1271 | 0.3150 | 0.1987 | 0.2335 | 0.1556 | 0.2334 | 0.0380 |
dummy | Dummy Classifier | 0.2879 | 0.1000 | 0.1933 | 0.0838 | 0.1296 | 0.0000 | 0.0000 | 0.0050 |
qda | Quadratic Discriminant Analysis | 0.0720 | 0.0000 | 0.1533 | 0.0065 | 0.0119 | 0.0000 | 0.0000 | 0.0070 |
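By default `compare_models()` ranks by Accuracy and returns only the single best fitted estimator. Both the ranking metric and the candidate pool are configurable (a sketch, assuming the PyCaret 2.x API):

# rank by F1 and restrict the comparison to tree ensembles
best_f1 = compare_models(sort = 'F1', include = ['et', 'rf', 'lightgbm', 'xgboost'])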
et = create_model('et')
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.6667 | 0.8778 | 0.7083 | 0.6306 | 0.6349 | 0.5556 | 0.5661 |
1 | 0.7273 | 0.0000 | 0.5500 | 0.6667 | 0.6818 | 0.6163 | 0.6390 |
2 | 0.9091 | 0.0000 | 0.9333 | 0.9273 | 0.9051 | 0.8764 | 0.8866 |
3 | 0.7273 | 0.0000 | 0.7000 | 0.6788 | 0.6851 | 0.6292 | 0.6519 |
4 | 0.7273 | 0.0000 | 0.7000 | 0.6788 | 0.6851 | 0.6292 | 0.6519 |
5 | 0.9091 | 0.0000 | 0.8000 | 0.8485 | 0.8727 | 0.8764 | 0.8866 |
6 | 0.8182 | 0.0000 | 0.8833 | 0.8182 | 0.8182 | 0.7556 | 0.7556 |
7 | 0.7273 | 0.0000 | 0.6833 | 0.6818 | 0.7013 | 0.6250 | 0.6326 |
8 | 0.8182 | 0.0000 | 0.6111 | 0.8364 | 0.8141 | 0.7556 | 0.7729 |
9 | 0.8182 | 0.9416 | 0.9028 | 0.8182 | 0.8182 | 0.7609 | 0.7609 |
Mean | 0.7848 | 0.1819 | 0.7472 | 0.7585 | 0.7616 | 0.7080 | 0.7204 |
SD | 0.0784 | 0.3642 | 0.1215 | 0.0965 | 0.0894 | 0.1072 | 0.1041 |
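`create_model` trains the chosen estimator with 10-fold cross-validation by default; the fold count and metric rounding can be adjusted (a sketch using the same API):

# 5-fold CV with metrics rounded to 3 decimals
et_cv5 = create_model('et', fold = 5, round = 3)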
tuned_et = tune_model(et)
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.7500 | 0.8570 | 0.6250 | 0.6333 | 0.6852 | 0.6471 | 0.6621 |
1 | 0.7273 | 0.0000 | 0.5500 | 0.6848 | 0.6804 | 0.6163 | 0.6471 |
2 | 0.7273 | 0.0000 | 0.5694 | 0.7273 | 0.7273 | 0.6374 | 0.6444 |
3 | 0.7273 | 0.0000 | 0.7000 | 0.6788 | 0.6851 | 0.6292 | 0.6519 |
4 | 0.6364 | 0.0000 | 0.5000 | 0.5273 | 0.5682 | 0.4884 | 0.5078 |
5 | 0.9091 | 0.0000 | 0.8000 | 0.8485 | 0.8727 | 0.8764 | 0.8866 |
6 | 0.8182 | 0.0000 | 0.8833 | 0.8182 | 0.8182 | 0.7556 | 0.7556 |
7 | 0.7273 | 0.0000 | 0.6667 | 0.6515 | 0.6727 | 0.6163 | 0.6408 |
8 | 0.7273 | 0.0000 | 0.4444 | 0.6545 | 0.6869 | 0.6250 | 0.6402 |
9 | 0.6364 | 0.9682 | 0.4444 | 0.4727 | 0.5397 | 0.4943 | 0.5142 |
Mean | 0.7386 | 0.1825 | 0.6183 | 0.6697 | 0.6936 | 0.6386 | 0.6551 |
SD | 0.0755 | 0.3659 | 0.1392 | 0.1089 | 0.0943 | 0.1072 | 0.1029 |
print(tuned_et)
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=9, max_features=1.0, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0005, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=130, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
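`tune_model` runs a randomized search over a predefined grid (10 iterations by default) and optimizes Accuracy. A larger budget, a different objective, or a hand-picked grid can be supplied (a sketch, assuming the PyCaret 2.x API; the grid values below are illustrative only):

# more search iterations, optimizing F1 instead of Accuracy
tuned_et_f1 = tune_model(et, optimize = 'F1', n_iter = 50)
# or search an explicit (illustrative) grid
tuned_et_grid = tune_model(et, custom_grid = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, None]})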
#plot_model(tuned_et, plot = 'auc')
plot_model(tuned_et, plot = 'pr')
plot_model(tuned_et, plot='feature')
plot_model(tuned_et, plot = 'confusion_matrix')
plot_model(tuned_et, plot = 'learning')
#plot_model(tuned_et, plot = 'threshold')
plot_model(tuned_et, plot = 'boundary')
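Rather than calling `plot_model` once per chart, `evaluate_model` wraps the same plots in an interactive selector inside the notebook:

# interactive plot selector for the tuned model
evaluate_model(tuned_et)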
rf = create_model('rf')
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.7500 | 0.9091 | 0.7500 | 0.7000 | 0.7222 | 0.6571 | 0.6640 |
1 | 0.7273 | 0.0000 | 0.4583 | 0.7000 | 0.6955 | 0.6207 | 0.6430 |
2 | 0.8182 | 0.0000 | 0.7361 | 0.9091 | 0.8545 | 0.7634 | 0.7719 |
3 | 0.7273 | 0.0000 | 0.7000 | 0.6788 | 0.6851 | 0.6292 | 0.6519 |
4 | 0.5455 | 0.0000 | 0.5333 | 0.5636 | 0.5303 | 0.3678 | 0.3819 |
5 | 0.9091 | 0.0000 | 0.8000 | 0.8485 | 0.8727 | 0.8764 | 0.8866 |
6 | 0.8182 | 0.0000 | 0.8833 | 0.8182 | 0.8182 | 0.7556 | 0.7556 |
7 | 0.7273 | 0.0000 | 0.6667 | 0.6515 | 0.6727 | 0.6163 | 0.6408 |
8 | 0.7273 | 0.0000 | 0.4444 | 0.6545 | 0.6869 | 0.6250 | 0.6402 |
9 | 0.8182 | 0.9773 | 0.7778 | 0.8182 | 0.7939 | 0.7660 | 0.7826 |
Mean | 0.7568 | 0.1886 | 0.6750 | 0.7342 | 0.7332 | 0.6677 | 0.6819 |
SD | 0.0909 | 0.3776 | 0.1414 | 0.1027 | 0.0979 | 0.1297 | 0.1272 |
tuned_rf = tune_model(rf)
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.7500 | 0.8494 | 0.7500 | 0.6944 | 0.7071 | 0.6571 | 0.6776 |
1 | 0.6364 | 0.0000 | 0.4167 | 0.6515 | 0.5939 | 0.4943 | 0.5319 |
2 | 0.7273 | 0.0000 | 0.6944 | 0.8788 | 0.7896 | 0.6598 | 0.6815 |
3 | 0.6364 | 0.0000 | 0.6500 | 0.7532 | 0.5818 | 0.5165 | 0.6098 |
4 | 0.8182 | 0.0000 | 0.9000 | 0.8909 | 0.8106 | 0.7609 | 0.7957 |
5 | 0.9091 | 0.0000 | 0.8000 | 0.8485 | 0.8727 | 0.8764 | 0.8866 |
6 | 0.7273 | 0.0000 | 0.8167 | 0.7273 | 0.7152 | 0.6413 | 0.6484 |
7 | 0.6364 | 0.0000 | 0.6167 | 0.6273 | 0.6242 | 0.5056 | 0.5115 |
8 | 0.8182 | 0.0000 | 0.6111 | 0.8364 | 0.8141 | 0.7556 | 0.7729 |
9 | 0.8182 | 0.9659 | 0.7778 | 0.8182 | 0.7939 | 0.7660 | 0.7826 |
Mean | 0.7477 | 0.1815 | 0.7033 | 0.7726 | 0.7303 | 0.6633 | 0.6898 |
SD | 0.0889 | 0.3640 | 0.1303 | 0.0903 | 0.0969 | 0.1223 | 0.1142 |
print(tuned_rf)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='entropy', max_depth=4, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=3, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
#plot_model(tuned_rf, plot = 'auc')
plot_model(tuned_rf, plot = 'pr')
plot_model(tuned_rf, plot = 'feature')
plot_model(tuned_rf, plot = 'confusion_matrix')
plot_model(tuned_rf, plot = 'learning')
#plot_model(tuned_rf, plot = 'threshold')
plot_model(tuned_rf, plot = 'boundary')
dataset = get_data(whichDataset)
| | RI | Na | Mg | Al | Si | K | Ca | Ba | Fe | Type |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.52101 | 13.64 | 4.49 | 1.10 | 71.78 | 0.06 | 8.75 | 0.0 | 0.0 | 1 |
1 | 1.51761 | 13.89 | 3.60 | 1.36 | 72.73 | 0.48 | 7.83 | 0.0 | 0.0 | 1 |
2 | 1.51618 | 13.53 | 3.55 | 1.54 | 72.99 | 0.39 | 7.78 | 0.0 | 0.0 | 1 |
3 | 1.51766 | 13.21 | 3.69 | 1.29 | 72.61 | 0.57 | 8.22 | 0.0 | 0.0 | 1 |
4 | 1.51742 | 13.27 | 3.62 | 1.24 | 73.08 | 0.55 | 8.07 | 0.0 | 0.0 | 1 |
# sanity-check predictions: tuned_et on the full dataset, tuned_rf on the hold-out set
predict_model(tuned_et, data=dataset);
predict_model(tuned_rf);
final_rf = finalize_model(tuned_rf);
# Note: finalize_model() refits the pipeline on the complete dataset
# (training + hold-out), so the hold-out metrics printed by
# predict_model(final_rf) below are optimistic.
# Final model parameters for deployment
print(final_rf)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='entropy', max_depth=4, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=3, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
predict_model(final_rf);
/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
(the two warnings above repeat for each fold in which a minority glass Type receives no predicted or true samples)
unseen_predictions = predict_model(final_rf, data=data_unseen)
unseen_predictions.head()
from pycaret.utils import check_metric
# compare the true 'Type' column against the predicted 'Label' column
check_metric(unseen_predictions['Type'], unseen_predictions['Label'], metric = 'Accuracy')
check_metric(unseen_predictions['Type'], unseen_predictions['Label'], metric = 'Recall')
check_metric(unseen_predictions['Type'], unseen_predictions['Label'], metric = 'Precision')
check_metric(unseen_predictions['Type'], unseen_predictions['Label'], metric = 'AUC')
check_metric(unseen_predictions['Type'], unseen_predictions['Label'], metric = 'F1')
save_model(final_rf, 'Final_Model')
load_saved_model = load_model('Final_Model')
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head(10)
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# provide the dataset name as shown in pycaret
whichDataset = 'glass'
from pycaret.datasets import get_data
dataset = get_data(whichDataset)
data = dataset.sample(frac=0.75, random_state=421)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
| | RI | Na | Mg | Al | Si | K | Ca | Ba | Fe | Type |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.52101 | 13.64 | 4.49 | 1.10 | 71.78 | 0.06 | 8.75 | 0.0 | 0.0 | 1 |
1 | 1.51761 | 13.89 | 3.60 | 1.36 | 72.73 | 0.48 | 7.83 | 0.0 | 0.0 | 1 |
2 | 1.51618 | 13.53 | 3.55 | 1.54 | 72.99 | 0.39 | 7.78 | 0.0 | 0.0 | 1 |
3 | 1.51766 | 13.21 | 3.69 | 1.29 | 72.61 | 0.57 | 8.22 | 0.0 | 0.0 | 1 |
4 | 1.51742 | 13.27 | 3.62 | 1.24 | 73.08 | 0.55 | 8.07 | 0.0 | 0.0 | 1 |
Data for Modeling: (160, 10)
Unseen Data For Predictions: (54, 10)
clf = setup(data = data, target = 'Type', session_id=1234)
| | Description | Value |
|---|---|---|
0 | session_id | 1234 |
1 | Target | Type |
2 | Target Type | Multiclass |
3 | Label Encoded | None |
4 | Original Data | (160, 10) |
5 | Missing Values | False |
6 | Numeric Features | 9 |
7 | Categorical Features | 0 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (111, 9) |
12 | Transformed Test Set | (49, 9) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 5bcd |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# compare all baseline models and select top 5
top_models = compare_models(n_select = 5)
| | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
|---|---|---|---|---|---|---|---|---|---|
rf | Random Forest Classifier | 0.7288 | 0.0000 | 0.6286 | 0.6754 | 0.6920 | 0.6257 | 0.6388 | 0.2660 |
lightgbm | Light Gradient Boosting Machine | 0.7205 | 0.0000 | 0.5975 | 0.6728 | 0.6780 | 0.6138 | 0.6330 | 0.0240 |
catboost | CatBoost Classifier | 0.7098 | 0.0000 | 0.6311 | 0.6717 | 0.6797 | 0.6021 | 0.6140 | 1.6920 |
xgboost | Extreme Gradient Boosting | 0.6841 | 0.0000 | 0.6044 | 0.6526 | 0.6541 | 0.5694 | 0.5856 | 0.0860 |
gbc | Gradient Boosting Classifier | 0.6833 | 0.0000 | 0.5878 | 0.6476 | 0.6503 | 0.5654 | 0.5852 | 0.1940 |
et | Extra Trees Classifier | 0.6833 | 0.0000 | 0.6178 | 0.6610 | 0.6589 | 0.5675 | 0.5799 | 0.2480 |
knn | K Neighbors Classifier | 0.6379 | 0.0000 | 0.5597 | 0.5850 | 0.5991 | 0.4999 | 0.5142 | 0.0670 |
lda | Linear Discriminant Analysis | 0.6045 | 0.0000 | 0.5350 | 0.5835 | 0.5814 | 0.4606 | 0.4730 | 0.0060 |
ada | Ada Boost Classifier | 0.5765 | 0.0000 | 0.4992 | 0.3865 | 0.4482 | 0.3775 | 0.4913 | 0.0380 |
lr | Logistic Regression | 0.5598 | 0.0000 | 0.4689 | 0.4991 | 0.5102 | 0.3775 | 0.4037 | 0.2910 |
ridge | Ridge Classifier | 0.5598 | 0.0000 | 0.4306 | 0.4945 | 0.5030 | 0.3676 | 0.3996 | 0.0050 |
dt | Decision Tree Classifier | 0.5591 | 0.0000 | 0.4856 | 0.6020 | 0.5576 | 0.4161 | 0.4351 | 0.0060 |
nb | Naive Bayes | 0.4318 | 0.0000 | 0.5331 | 0.5087 | 0.4179 | 0.3030 | 0.3563 | 0.0060 |
dummy | Dummy Classifier | 0.3697 | 0.0000 | 0.2050 | 0.1376 | 0.2002 | 0.0000 | 0.0000 | 0.0050 |
svm | SVM - Linear Kernel | 0.3242 | 0.0000 | 0.3150 | 0.1983 | 0.2269 | 0.1635 | 0.2437 | 0.0360 |
qda | Quadratic Discriminant Analysis | 0.2492 | 0.0000 | 0.2817 | 0.1757 | 0.1704 | 0.1230 | 0.1444 | 0.0080 |
top_models
[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=1234, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), <catboost.core.CatBoostClassifier at 0x7f1b73218c10>, XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method='auto', validate_parameters=1, verbosity=0), GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)]
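With `n_select=5`, `compare_models` returns a list of the five best fitted estimators, ordered by the sort metric, rather than a single model; this is what the list comprehensions below iterate over:

# top_models is an ordered list; index 0 is the leaderboard winner
print(len(top_models), type(top_models))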
# tune top base models
tuned_top_models = [tune_model(i) for i in top_models]
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.6667 | 0.0000 | 0.5500 | 0.6167 | 0.6389 | 0.5514 | 0.5570 |
1 | 0.9091 | 0.0000 | 0.9167 | 0.9545 | 0.9152 | 0.8690 | 0.8796 |
2 | 0.6364 | 0.0000 | 0.5000 | 0.5682 | 0.5974 | 0.5000 | 0.5060 |
3 | 0.6364 | 0.0000 | 0.4667 | 0.5260 | 0.5554 | 0.4699 | 0.5139 |
4 | 0.5455 | 0.0000 | 0.4500 | 0.5273 | 0.5076 | 0.4022 | 0.4206 |
5 | 0.7273 | 0.0000 | 0.5500 | 0.6045 | 0.6580 | 0.6118 | 0.6287 |
6 | 0.6364 | 0.0000 | 0.4833 | 0.5818 | 0.6061 | 0.4943 | 0.5005 |
7 | 0.7273 | 0.0000 | 0.6667 | 0.7532 | 0.6736 | 0.6118 | 0.6747 |
8 | 0.5455 | 0.0000 | 0.4333 | 0.4636 | 0.4993 | 0.3529 | 0.3627 |
9 | 0.9091 | 0.0000 | 0.9000 | 0.9318 | 0.9004 | 0.8764 | 0.8866 |
Mean | 0.6939 | 0.0000 | 0.5917 | 0.6528 | 0.6552 | 0.5740 | 0.5930 |
SD | 0.1223 | 0.0000 | 0.1705 | 0.1620 | 0.1379 | 0.1682 | 0.1681 |
tuned_top_models
[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0, min_impurity_split=None, min_samples_leaf=5, min_samples_split=7, min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), LGBMClassifier(bagging_fraction=0.9, bagging_freq=4, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=0.4, importance_type='split', learning_rate=0.05, max_depth=-1, min_child_samples=21, min_child_weight=0.001, min_split_gain=0.3, n_estimators=170, n_jobs=-1, num_leaves=2, objective=None, random_state=1234, reg_alpha=0.01, reg_lambda=3, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), <catboost.core.CatBoostClassifier at 0x7f1b7214fe50>, XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=1, min_child_weight=3, missing=nan, monotone_constraints='()', n_estimators=200, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=0.5, reg_lambda=1e-06, scale_pos_weight=23.700000000000003, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0), GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.15, loss='deviance', max_depth=7, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.05, min_impurity_split=None, min_samples_leaf=4, min_samples_split=4, min_weight_fraction_leaf=0.0, n_estimators=290, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=0.55, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)]
# ensemble top tuned models
bagged_top_models = [ensemble_model(i) for i in tuned_top_models]
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.7500 | 0.0000 | 0.6000 | 0.6389 | 0.6833 | 0.6571 | 0.6786 |
1 | 1.0000 | 0.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
2 | 0.7273 | 0.0000 | 0.5500 | 0.6045 | 0.6580 | 0.6118 | 0.6287 |
3 | 0.6364 | 0.0000 | 0.4667 | 0.5606 | 0.5818 | 0.4884 | 0.5078 |
4 | 0.5455 | 0.0000 | 0.3750 | 0.5000 | 0.4848 | 0.3956 | 0.4353 |
5 | 0.7273 | 0.0000 | 0.5500 | 0.6045 | 0.6580 | 0.6118 | 0.6287 |
6 | 0.6364 | 0.0000 | 0.4833 | 0.5364 | 0.5801 | 0.4824 | 0.4957 |
7 | 0.6364 | 0.0000 | 0.6167 | 0.5909 | 0.6000 | 0.4884 | 0.5078 |
8 | 0.6364 | 0.0000 | 0.6333 | 0.5909 | 0.6104 | 0.5000 | 0.5060 |
9 | 0.7273 | 0.0000 | 0.6000 | 0.5455 | 0.6187 | 0.6118 | 0.6551 |
Mean | 0.7023 | 0.0000 | 0.5875 | 0.6172 | 0.6475 | 0.5847 | 0.6044 |
SD | 0.1161 | 0.0000 | 0.1571 | 0.1332 | 0.1287 | 0.1587 | 0.1531 |
bagged_top_models
[BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0, min_impurity_split=None, min_samples_leaf=5, min_samples_split=7, min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=LGBMClassifier(bagging_fraction=0.9, bagging_freq=4, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=0.4, importance_type='split', learning_rate=0.05, max_depth=-1, min_child_samples=21, min_child_weight=0.001, min_split_gain=0.3, n_estimators=170, n_jobs=-1, num_leaves=2, objective=None, random_state=1234, reg_alpha=0.01, reg_lambda=3, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=<catboost.core.CatBoostClassifier object at 0x7f1b71f7a990>, bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=1, min_child_weight=3, missing=nan, monotone_constraints='()', n_estimators=200, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=0.5, reg_lambda=1e-06, scale_pos_weight=23.700000000000003, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.15, loss='deviance', max_depth=7, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.05, min_impurity_split=None, min_samples_leaf=4, min_samples_split=4, min_weight_fraction_leaf=0.0, n_estimators=290, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=0.55, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False)]
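Bagging each tuned model is only one ensembling option; the same list can also be combined by voting or stacking (a sketch, assuming the PyCaret 2.x API):

# soft-voting ensemble over the tuned models
blender = blend_models(estimator_list = tuned_top_models, method = 'soft')
# stacking ensemble with PyCaret's default meta-model
stacker = stack_models(estimator_list = tuned_top_models)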
# select the best model trained in this session, by each metric in turn
best1 = automl(optimize = 'AUC')
best2 = automl(optimize = 'Accuracy')
best3 = automl(optimize = 'Recall')
best4 = automl(optimize = 'Precision')
best5 = automl(optimize = 'F1')
print(); print("Best model based on AUC: "); print(best1)
print(); print("Best model based on Accuracy: "); print(best2)
print(); print("Best model based on Recall: "); print(best3)
print(); print("Best model based on Precision: "); print(best4)
print(); print("Best model based on F1: "); print(best5)
Best model based on AUC:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=1234, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

Best model based on Accuracy:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=1, min_child_weight=3, missing=nan, monotone_constraints='()', n_estimators=200, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=0.5, reg_lambda=1e-06, scale_pos_weight=23.700000000000003, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)

Best model based on Recall:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0, min_impurity_split=None, min_samples_leaf=5, min_samples_split=7, min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)

Best model based on Precision:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0, min_impurity_split=None, min_samples_leaf=5, min_samples_split=7, min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)

Best model based on F1:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0, min_impurity_split=None, min_samples_leaf=5, min_samples_split=7, min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
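`automl()` scans every model trained in the current session (base, tuned, and bagged) and returns the one with the best hold-out score for the requested metric, which is why the winner can differ from metric to metric. The returned estimator is already fitted and can be scored directly:

# hold-out predictions from the Accuracy winner
holdout_preds = predict_model(best2)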
plot_model(best2, plot = 'auc')
plot_model(best2, plot = 'confusion_matrix')
plot_model(best2, plot = 'learning')
save_model(best2,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='Type', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_strateg... interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=1, min_child_weight=3, missing=nan, monotone_constraints='()', n_estimators=200, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=0.5, reg_lambda=1e-06, scale_pos_weight=23.700000000000003, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)]], verbose=False), 'Final_Model.pkl')
load_saved_model = load_model('Final_Model')
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head()
In this coding recipe, we demonstrated how to build a machine learning model in Python using PyCaret: set up a classification experiment, compare and tune candidate models, ensemble the best performers, and save a finalized pipeline for predictions on unseen data.