# For more projects visit: https://setscholars.net
# Suppress warnings in Jupyter Notebooks
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from pycaret.classification import *

# OpenML Dataset ID
whichDataset = 6  # provide dataset id (6 = the 'letter' recognition dataset)

import openml
from openml.datasets import get_dataset

# Fetch the dataset handle from OpenML (network call; result is cached by openml).
# Fix: use the directly imported `get_dataset` helper -- it was imported above
# but the original called the fully-qualified `openml.datasets.get_dataset`.
dataset = get_dataset(whichDataset)

# Print a summary
print(
    f"This is dataset '{dataset.name}', the target feature is "
    f"'{dataset.default_target_attribute}'"
)
print(f"URL: {dataset.url}")
print(dataset.description)
This is dataset 'letter', the target feature is 'class' URL: https://www.openml.org/data/v1/download/6/letter.arff **Author**: David J. Slate **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 **Please cite**: P. W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". Machine Learning 6(2), 1991 1. TITLE: Letter Image Recognition Data The objective is to identify each of a large number of black-and-white rectangular pixel displays as one of the 26 capital letters in the English alphabet. The character images were based on 20 different fonts and each letter within these 20 fonts was randomly distorted to produce a file of 20,000 unique stimuli. Each stimulus was converted into 16 primitive numerical attributes (statistical moments and edge counts) which were then scaled to fit into a range of integer values from 0 through 15. We typically train on the first 16000 items and then use the resulting model to predict the letter category for the remaining 4000. See the article cited above for more details.
# Re-suppress warnings (harmless to repeat per notebook cell).
import warnings
warnings.filterwarnings("ignore")
# Pull features/target out of the OpenML dataset handle created above.
# NOTE(review): `dataset_format="array"` returns numpy arrays; newer openml
# versions deprecate this in favour of "dataframe" -- confirm installed version.
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute)
# Rebind `dataset` from the OpenML handle to a pandas DataFrame of features,
# then attach the encoded class labels as a 'target' column.
dataset = pd.DataFrame(X, columns=attribute_names)
dataset["target"] = y
# Expected: (20000, 17) -- 16 features + target.
dataset.shape
(20000, 17)
# Preview the first five rows of the assembled DataFrame.
dataset.head()
x-box | y-box | width | high | onpix | x-bar | y-bar | x2bar | y2bar | xybar | x2ybr | xy2br | x-ege | xegvy | y-ege | yegvx | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.0 | 4.0 | 4.0 | 3.0 | 2.0 | 7.0 | 8.0 | 2.0 | 9.0 | 11.0 | 7.0 | 7.0 | 1.0 | 8.0 | 5.0 | 6.0 | 25 |
1 | 4.0 | 7.0 | 5.0 | 5.0 | 5.0 | 5.0 | 9.0 | 6.0 | 4.0 | 8.0 | 7.0 | 9.0 | 2.0 | 9.0 | 7.0 | 10.0 | 15 |
2 | 7.0 | 10.0 | 8.0 | 7.0 | 4.0 | 8.0 | 8.0 | 5.0 | 10.0 | 11.0 | 2.0 | 8.0 | 2.0 | 5.0 | 5.0 | 10.0 | 18 |
3 | 4.0 | 9.0 | 5.0 | 7.0 | 4.0 | 7.0 | 7.0 | 13.0 | 1.0 | 7.0 | 6.0 | 8.0 | 3.0 | 8.0 | 0.0 | 8.0 | 7 |
4 | 6.0 | 7.0 | 8.0 | 5.0 | 4.0 | 7.0 | 6.0 | 3.0 | 7.0 | 10.0 | 7.0 | 9.0 | 3.0 | 8.0 | 3.0 | 7.0 | 7 |
#dataset.columns.to_list()
# find missing values in data frame
# Total count of NaN cells across the whole DataFrame (0 = no missing data).
dataset.isnull().sum().sum()
0
# Defensive no-op here: the check above reported zero missing values,
# so filling NaNs with 0 leaves the data unchanged; kept as a safeguard.
dataset = dataset.fillna(0)
# Re-verify: still expected to be 0.
dataset.isnull().sum().sum()
0
# group by 'target'
# Row count per class label -- shows the 26 letter classes are roughly balanced.
dataset.groupby('target').count()
x-box | y-box | width | high | onpix | x-bar | y-bar | x2bar | y2bar | xybar | x2ybr | xy2br | x-ege | xegvy | y-ege | yegvx | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
target | ||||||||||||||||
0 | 789 | 789 | 789 | 789 | 789 | 789 | 789 | 789 | 789 | 789 | 789 | 789 | 789 | 789 | 789 | 789 |
1 | 766 | 766 | 766 | 766 | 766 | 766 | 766 | 766 | 766 | 766 | 766 | 766 | 766 | 766 | 766 | 766 |
2 | 736 | 736 | 736 | 736 | 736 | 736 | 736 | 736 | 736 | 736 | 736 | 736 | 736 | 736 | 736 | 736 |
3 | 805 | 805 | 805 | 805 | 805 | 805 | 805 | 805 | 805 | 805 | 805 | 805 | 805 | 805 | 805 | 805 |
4 | 768 | 768 | 768 | 768 | 768 | 768 | 768 | 768 | 768 | 768 | 768 | 768 | 768 | 768 | 768 | 768 |
5 | 775 | 775 | 775 | 775 | 775 | 775 | 775 | 775 | 775 | 775 | 775 | 775 | 775 | 775 | 775 | 775 |
6 | 773 | 773 | 773 | 773 | 773 | 773 | 773 | 773 | 773 | 773 | 773 | 773 | 773 | 773 | 773 | 773 |
7 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 |
8 | 755 | 755 | 755 | 755 | 755 | 755 | 755 | 755 | 755 | 755 | 755 | 755 | 755 | 755 | 755 | 755 |
9 | 747 | 747 | 747 | 747 | 747 | 747 | 747 | 747 | 747 | 747 | 747 | 747 | 747 | 747 | 747 | 747 |
10 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 |
11 | 761 | 761 | 761 | 761 | 761 | 761 | 761 | 761 | 761 | 761 | 761 | 761 | 761 | 761 | 761 | 761 |
12 | 792 | 792 | 792 | 792 | 792 | 792 | 792 | 792 | 792 | 792 | 792 | 792 | 792 | 792 | 792 | 792 |
13 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 |
14 | 753 | 753 | 753 | 753 | 753 | 753 | 753 | 753 | 753 | 753 | 753 | 753 | 753 | 753 | 753 | 753 |
15 | 803 | 803 | 803 | 803 | 803 | 803 | 803 | 803 | 803 | 803 | 803 | 803 | 803 | 803 | 803 | 803 |
16 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 | 783 |
17 | 758 | 758 | 758 | 758 | 758 | 758 | 758 | 758 | 758 | 758 | 758 | 758 | 758 | 758 | 758 | 758 |
18 | 748 | 748 | 748 | 748 | 748 | 748 | 748 | 748 | 748 | 748 | 748 | 748 | 748 | 748 | 748 | 748 |
19 | 796 | 796 | 796 | 796 | 796 | 796 | 796 | 796 | 796 | 796 | 796 | 796 | 796 | 796 | 796 | 796 |
20 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 |
21 | 764 | 764 | 764 | 764 | 764 | 764 | 764 | 764 | 764 | 764 | 764 | 764 | 764 | 764 | 764 | 764 |
22 | 752 | 752 | 752 | 752 | 752 | 752 | 752 | 752 | 752 | 752 | 752 | 752 | 752 | 752 | 752 | 752 |
23 | 787 | 787 | 787 | 787 | 787 | 787 | 787 | 787 | 787 | 787 | 787 | 787 | 787 | 787 | 787 | 787 |
24 | 786 | 786 | 786 | 786 | 786 | 786 | 786 | 786 | 786 | 786 | 786 | 786 | 786 | 786 | 786 | 786 |
25 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 | 734 |
# training and test data split
# Hold out 25% of rows as "unseen" data for a final out-of-pipeline check;
# random_state pins the shuffle so the split is reproducible.
data = dataset.sample(frac=0.75, random_state=1234)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
# Fix: f-strings (already used earlier in this file) instead of
# dated '...' + str(...) concatenation; printed text is unchanged.
print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')
Data for Modeling: (15000, 17) Unseen Data For Predictions: (5000, 17)
# Initialise the PyCaret classification experiment: infers feature types and
# builds the preprocessing pipeline + internal train/test split.
# session_id pins all downstream randomness for reproducibility.
env_setup = setup(data = data, target = 'target', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | target |
2 | Target Type | Multiclass |
3 | Label Encoded | None |
4 | Original Data | (15000, 17) |
5 | Missing Values | False |
6 | Numeric Features | 16 |
7 | Categorical Features | 0 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (10499, 16) |
12 | Transformed Test Set | (4501, 16) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 1114 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# --------------------------------------
# Cross-validate every available classifier (CatBoost excluded) and keep
# the single best estimator ranked by mean CV Accuracy.
best_model = compare_models(exclude = ['catboost'], sort = 'Accuracy')
# --------------------------------------
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
et | Extra Trees Classifier | 0.9566 | 0.9992 | 0.9564 | 0.9578 | 0.9566 | 0.9548 | 0.9549 | 0.6230 |
lightgbm | Light Gradient Boosting Machine | 0.9524 | 0.9992 | 0.9522 | 0.9536 | 0.9525 | 0.9505 | 0.9505 | 1.6180 |
rf | Random Forest Classifier | 0.9478 | 0.9991 | 0.9476 | 0.9493 | 0.9478 | 0.9457 | 0.9458 | 0.8200 |
xgboost | Extreme Gradient Boosting | 0.9463 | 0.9992 | 0.9461 | 0.9475 | 0.9463 | 0.9441 | 0.9442 | 9.2070 |
knn | K Neighbors Classifier | 0.9343 | 0.9950 | 0.9339 | 0.9369 | 0.9345 | 0.9316 | 0.9317 | 0.1540 |
gbc | Gradient Boosting Classifier | 0.9082 | 0.9975 | 0.9078 | 0.9111 | 0.9084 | 0.9045 | 0.9046 | 14.4530 |
qda | Quadratic Discriminant Analysis | 0.8826 | 0.9957 | 0.8823 | 0.8865 | 0.8825 | 0.8779 | 0.8780 | 0.0230 |
dt | Decision Tree Classifier | 0.8380 | 0.9158 | 0.8378 | 0.8410 | 0.8381 | 0.8315 | 0.8316 | 0.0340 |
lr | Logistic Regression | 0.7770 | 0.9800 | 0.7756 | 0.7786 | 0.7757 | 0.7681 | 0.7683 | 4.2490 |
lda | Linear Discriminant Analysis | 0.7000 | 0.9671 | 0.6993 | 0.7139 | 0.7004 | 0.6880 | 0.6885 | 0.0240 |
nb | Naive Bayes | 0.6377 | 0.9560 | 0.6373 | 0.6537 | 0.6336 | 0.6232 | 0.6241 | 0.0200 |
ridge | Ridge Classifier | 0.5603 | 0.0000 | 0.5597 | 0.5841 | 0.5231 | 0.5427 | 0.5465 | 0.0120 |
svm | SVM - Linear Kernel | 0.5567 | 0.0000 | 0.5547 | 0.6837 | 0.5433 | 0.5389 | 0.5475 | 0.5110 |
ada | Ada Boost Classifier | 0.2197 | 0.7295 | 0.2199 | 0.2224 | 0.1743 | 0.1884 | 0.1948 | 0.3320 |
dummy | Dummy Classifier | 0.0413 | 0.5000 | 0.0385 | 0.0017 | 0.0033 | 0.0000 | 0.0000 | 0.0120 |
# Train an Extra Trees classifier with PyCaret's default 10-fold CV.
model_1 = create_model('et')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.9610 | 0.9997 | 0.9615 | 0.9623 | 0.9610 | 0.9594 | 0.9594 |
1 | 0.9543 | 0.9993 | 0.9535 | 0.9552 | 0.9541 | 0.9525 | 0.9525 | |
2 | 0.9629 | 0.9987 | 0.9625 | 0.9638 | 0.9629 | 0.9614 | 0.9614 | |
3 | 0.9533 | 0.9994 | 0.9535 | 0.9544 | 0.9535 | 0.9515 | 0.9515 | |
4 | 0.9457 | 0.9990 | 0.9449 | 0.9472 | 0.9455 | 0.9435 | 0.9436 | |
5 | 0.9600 | 0.9995 | 0.9598 | 0.9614 | 0.9602 | 0.9584 | 0.9584 | |
6 | 0.9581 | 0.9994 | 0.9576 | 0.9594 | 0.9582 | 0.9564 | 0.9565 | |
7 | 0.9514 | 0.9985 | 0.9509 | 0.9529 | 0.9514 | 0.9495 | 0.9495 | |
8 | 0.9562 | 0.9993 | 0.9563 | 0.9573 | 0.9561 | 0.9544 | 0.9545 | |
9 | 0.9628 | 0.9990 | 0.9632 | 0.9637 | 0.9629 | 0.9613 | 0.9614 | |
Mean | 0.9566 | 0.9992 | 0.9564 | 0.9578 | 0.9566 | 0.9548 | 0.9549 | |
Std | 0.0052 | 0.0004 | 0.0055 | 0.0051 | 0.0053 | 0.0054 | 0.0054 | |
Train | nan | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
# Hyperparameter tuning via random search (PyCaret default budget).
tuned_model_1 = tune_model(model_1)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.8962 | 0.9968 | 0.8963 | 0.9060 | 0.8964 | 0.8920 | 0.8924 |
1 | 0.8810 | 0.9958 | 0.8793 | 0.8956 | 0.8824 | 0.8762 | 0.8767 | |
2 | 0.8743 | 0.9951 | 0.8735 | 0.8876 | 0.8771 | 0.8692 | 0.8697 | |
3 | 0.8790 | 0.9960 | 0.8785 | 0.8893 | 0.8813 | 0.8742 | 0.8745 | |
4 | 0.8629 | 0.9950 | 0.8619 | 0.8773 | 0.8649 | 0.8574 | 0.8578 | |
5 | 0.8771 | 0.9957 | 0.8761 | 0.8865 | 0.8784 | 0.8722 | 0.8725 | |
6 | 0.8771 | 0.9953 | 0.8768 | 0.8889 | 0.8784 | 0.8722 | 0.8726 | |
7 | 0.8752 | 0.9947 | 0.8745 | 0.8848 | 0.8760 | 0.8702 | 0.8706 | |
8 | 0.8886 | 0.9956 | 0.8886 | 0.8974 | 0.8898 | 0.8841 | 0.8844 | |
9 | 0.8875 | 0.9960 | 0.8882 | 0.9006 | 0.8901 | 0.8830 | 0.8834 | |
Mean | 0.8799 | 0.9956 | 0.8794 | 0.8914 | 0.8815 | 0.8751 | 0.8754 | |
Std | 0.0087 | 0.0006 | 0.0091 | 0.0080 | 0.0084 | 0.0091 | 0.0090 | |
Train | nan | 0.9296 | 0.9981 | 0.9293 | 0.9345 | 0.9305 | 0.9268 | 0.9269 |
# Show the tuned estimator's hyperparameters.
print(tuned_model_1)
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=9, max_features=1.0, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0005, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=130, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
# Re-tune with a larger search budget (100 random-search iterations);
# rebinds tuned_model_1 to the new best candidate.
tuned_model_1 = tune_model(model_1, n_iter=100)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.9067 | 0.9971 | 0.9069 | 0.9166 | 0.9075 | 0.9029 | 0.9033 |
1 | 0.8952 | 0.9964 | 0.8942 | 0.9055 | 0.8959 | 0.8910 | 0.8914 | |
2 | 0.8886 | 0.9963 | 0.8882 | 0.8954 | 0.8895 | 0.8841 | 0.8843 | |
3 | 0.8943 | 0.9968 | 0.8944 | 0.9023 | 0.8961 | 0.8901 | 0.8903 | |
4 | 0.8657 | 0.9959 | 0.8649 | 0.8772 | 0.8674 | 0.8603 | 0.8607 | |
5 | 0.8829 | 0.9966 | 0.8826 | 0.8901 | 0.8837 | 0.8782 | 0.8784 | |
6 | 0.8876 | 0.9961 | 0.8875 | 0.8959 | 0.8885 | 0.8831 | 0.8834 | |
7 | 0.8838 | 0.9956 | 0.8830 | 0.8910 | 0.8846 | 0.8791 | 0.8794 | |
8 | 0.8990 | 0.9965 | 0.8991 | 0.9054 | 0.8994 | 0.8950 | 0.8952 | |
9 | 0.8951 | 0.9964 | 0.8961 | 0.9053 | 0.8972 | 0.8909 | 0.8912 | |
Mean | 0.8899 | 0.9964 | 0.8897 | 0.8985 | 0.8910 | 0.8855 | 0.8858 | |
Std | 0.0106 | 0.0004 | 0.0109 | 0.0104 | 0.0105 | 0.0110 | 0.0110 | |
Train | nan | 0.9402 | 0.9987 | 0.9400 | 0.9429 | 0.9407 | 0.9378 | 0.9379 |
# Show the hyperparameters selected by the larger search.
print(tuned_model_1)
ExtraTreesClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='entropy', max_depth=10, max_features=1.0, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=6, min_samples_split=9, min_weight_fraction_leaf=0.0, n_estimators=110, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
# Diagnostic plots for the tuned Extra Trees model.
plot_model(tuned_model_1, plot = 'auc')
plot_model(tuned_model_1, plot = 'pr')
#plot_model(tuned_model_1, plot='feature')
plot_model(tuned_model_1, plot = 'confusion_matrix')
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
# More diagnostics for model 1 (learning curve, decision boundary, errors).
plot_model(tuned_model_1, plot = 'learning')
#plot_model(tuned_model_1, plot = 'threshold')
plot_model(tuned_model_1, plot = 'boundary')
plot_model(tuned_model_1, plot = 'error')
# Second candidate: XGBoost classifier, 10-fold CV.
model_2 = create_model('xgboost')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.9552 | 0.9995 | 0.9555 | 0.9564 | 0.9553 | 0.9534 | 0.9535 |
1 | 0.9505 | 0.9992 | 0.9496 | 0.9516 | 0.9504 | 0.9485 | 0.9485 | |
2 | 0.9467 | 0.9993 | 0.9468 | 0.9480 | 0.9466 | 0.9445 | 0.9446 | |
3 | 0.9476 | 0.9994 | 0.9476 | 0.9485 | 0.9477 | 0.9455 | 0.9455 | |
4 | 0.9324 | 0.9989 | 0.9314 | 0.9344 | 0.9323 | 0.9297 | 0.9298 | |
5 | 0.9486 | 0.9988 | 0.9488 | 0.9501 | 0.9486 | 0.9465 | 0.9466 | |
6 | 0.9552 | 0.9995 | 0.9547 | 0.9560 | 0.9551 | 0.9534 | 0.9535 | |
7 | 0.9410 | 0.9989 | 0.9408 | 0.9420 | 0.9409 | 0.9386 | 0.9386 | |
8 | 0.9400 | 0.9990 | 0.9402 | 0.9409 | 0.9398 | 0.9376 | 0.9376 | |
9 | 0.9457 | 0.9991 | 0.9460 | 0.9467 | 0.9458 | 0.9435 | 0.9435 | |
Mean | 0.9463 | 0.9992 | 0.9461 | 0.9475 | 0.9463 | 0.9441 | 0.9442 | |
Std | 0.0067 | 0.0002 | 0.0068 | 0.0065 | 0.0067 | 0.0070 | 0.0069 | |
Train | nan | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
# Random-search tuning for the XGBoost candidate (default budget).
tuned_model_2 = tune_model(model_2)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.9505 | 0.9995 | 0.9506 | 0.9521 | 0.9505 | 0.9485 | 0.9486 |
1 | 0.9543 | 0.9991 | 0.9534 | 0.9554 | 0.9543 | 0.9525 | 0.9525 | |
2 | 0.9514 | 0.9992 | 0.9512 | 0.9530 | 0.9514 | 0.9495 | 0.9495 | |
3 | 0.9448 | 0.9993 | 0.9446 | 0.9464 | 0.9450 | 0.9425 | 0.9426 | |
4 | 0.9371 | 0.9991 | 0.9365 | 0.9387 | 0.9372 | 0.9346 | 0.9347 | |
5 | 0.9429 | 0.9989 | 0.9429 | 0.9445 | 0.9429 | 0.9406 | 0.9406 | |
6 | 0.9552 | 0.9995 | 0.9545 | 0.9564 | 0.9551 | 0.9534 | 0.9535 | |
7 | 0.9467 | 0.9986 | 0.9462 | 0.9476 | 0.9467 | 0.9445 | 0.9446 | |
8 | 0.9448 | 0.9993 | 0.9448 | 0.9457 | 0.9444 | 0.9425 | 0.9426 | |
9 | 0.9438 | 0.9990 | 0.9442 | 0.9453 | 0.9440 | 0.9415 | 0.9415 | |
Mean | 0.9471 | 0.9991 | 0.9469 | 0.9485 | 0.9472 | 0.9450 | 0.9451 | |
Std | 0.0054 | 0.0003 | 0.0052 | 0.0053 | 0.0053 | 0.0056 | 0.0056 | |
Train | nan | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
# Show the tuned XGBoost hyperparameters.
print(tuned_model_2)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)
# Diagnostic plots for the tuned XGBoost model.
plot_model(tuned_model_2, plot = 'auc')
#plot_model(tuned_model_2, plot = 'pr')
plot_model(tuned_model_2, plot = 'feature')
plot_model(tuned_model_2, plot = 'confusion_matrix')
#plot_model(tuned_model_2, plot = 'learning')
#plot_model(tuned_model_2, plot = 'threshold')
plot_model(tuned_model_2, plot = 'boundary')
plot_model(tuned_model_2, plot = 'error')
# Evaluate model 1 on PyCaret's internal hold-out set.
predict_model(tuned_model_1);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Extra Trees Classifier | 0.9011 | 0.9970 | 0.9009 | 0.9073 | 0.9022 | 0.8972 | 0.8973 |
# Evaluate model 2 on PyCaret's internal hold-out set.
predict_model(tuned_model_2);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Extreme Gradient Boosting | 0.9509 | 0.9994 | 0.9505 | 0.9517 | 0.9510 | 0.9489 | 0.9490 |
# Refit the better candidate (XGBoost) on the full modelling data,
# i.e. PyCaret's train split plus its internal hold-out.
final_model = finalize_model(tuned_model_2);
# Final model parameters for deployment
print(final_model)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)
# NOTE(review): after finalize_model the internal hold-out rows were part of
# training, so the perfect score below is NOT a generalisation estimate --
# use the unseen split evaluated next for that.
predict_model(final_model);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Extreme Gradient Boosting | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
# Score the truly unseen 25% split; adds 'Label' (predicted class) and
# 'Score' columns to a copy of the data.
unseen_predictions = predict_model(final_model, data=data_unseen)
unseen_predictions.head()
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Extreme Gradient Boosting | 0.9572 | 0.9996 | 0.9566 | 0.9575 | 0.9572 | 0.9555 | 0.9555 |
x-box | y-box | width | high | onpix | x-bar | y-bar | x2bar | y2bar | xybar | x2ybr | xy2br | x-ege | xegvy | y-ege | yegvx | target | Label | Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.0 | 9.0 | 5.0 | 7.0 | 4.0 | 7.0 | 7.0 | 13.0 | 1.0 | 7.0 | 6.0 | 8.0 | 3.0 | 8.0 | 0.0 | 8.0 | 7 | 7 | 0.9989 |
1 | 2.0 | 7.0 | 4.0 | 5.0 | 1.0 | 9.0 | 8.0 | 4.0 | 2.0 | 5.0 | 13.0 | 8.0 | 3.0 | 10.0 | 0.0 | 8.0 | 21 | 21 | 0.9997 |
2 | 7.0 | 11.0 | 10.0 | 8.0 | 9.0 | 9.0 | 7.0 | 3.0 | 6.0 | 10.0 | 4.0 | 7.0 | 5.0 | 6.0 | 4.0 | 9.0 | 7 | 7 | 0.9762 |
3 | 2.0 | 4.0 | 5.0 | 3.0 | 2.0 | 7.0 | 8.0 | 2.0 | 9.0 | 11.0 | 7.0 | 7.0 | 1.0 | 8.0 | 5.0 | 6.0 | 25 | 25 | 0.9981 |
4 | 5.0 | 10.0 | 7.0 | 9.0 | 3.0 | 8.0 | 5.0 | 9.0 | 8.0 | 6.0 | 4.0 | 8.0 | 3.0 | 8.0 | 4.0 | 8.0 | 16 | 16 | 0.8965 |
from pycaret.utils import check_metric
# Accuracy on the unseen split (true target vs predicted Label).
check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'Accuracy')
0.9572
# Recall on the unseen split.
check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'Recall')
0.9566
# Precision on the unseen split.
check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'Precision')
0.9575
#check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'AUC')
# F1 on the unseen split.
check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'F1')
0.9572
# Persist the full preprocessing pipeline + model to Final_Model.pkl.
save_model(final_model,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='target', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_strat... interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)]], verbose=False), 'Final_Model.pkl')
# Round-trip check: reload the saved pipeline + model from disk.
load_saved_model = load_model('Final_Model')
Transformation Pipeline and Model Successfully Loaded
# Re-score the unseen split with the reloaded model; metrics should match
# the pre-save evaluation exactly.
new_prediction = predict_model(load_saved_model, data=data_unseen)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Extreme Gradient Boosting | 0.9572 | 0.9996 | 0.9566 | 0.9575 | 0.9572 | 0.9555 | 0.9555 |
# Predicted class ('Label') and its score for the first rows.
# NOTE(review): 'Score' is presumably the predicted-class probability -- confirm
# against the PyCaret predict_model documentation for the installed version.
new_prediction[["Label", "Score"]].head()
Label | Score | |
---|---|---|
0 | 7 | 0.9989 |
1 | 21 | 0.9997 |
2 | 7 | 0.9762 |
3 | 25 | 0.9981 |
4 | 16 | 0.8965 |
# ---- Second, independent run: fresh imports and a new 70/30 split ----
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from pycaret.classification import *

# OpenML Dataset ID
whichDataset = 6  # provide dataset id (6 = the 'letter' recognition dataset)

import openml
from openml.datasets import get_dataset

# Fix: use the directly imported `get_dataset` helper (previously imported
# but bypassed in favour of the fully-qualified call).
dataset = get_dataset(whichDataset)

X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute)
dataset = pd.DataFrame(X, columns=attribute_names)
dataset["target"] = y

# 70/30 split this time, with a different seed than the first run.
data = dataset.sample(frac=0.70, random_state=421)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
# Fix: f-strings instead of '...' + str(...) concatenation; output unchanged.
print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')
Data for Modeling: (14000, 17) Unseen Data For Predictions: (6000, 17)
# New PyCaret classification experiment on the 70% modelling split;
# same session_id keeps the pipeline randomness reproducible.
clf = setup(data = data, target = 'target', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | target |
2 | Target Type | Multiclass |
3 | Label Encoded | None |
4 | Original Data | (14000, 17) |
5 | Missing Values | False |
6 | Numeric Features | 16 |
7 | Categorical Features | 0 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (9799, 16) |
12 | Transformed Test Set | (4201, 16) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 59ba |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
# Suppress the heavy sklearn deprecation/metric-warning spam that
# compare_models triggers during cross-validation.
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# compare all baseline models and keep the top 3 ranked by Accuracy
# (catboost and lightgbm are excluded from the comparison)
top_models = compare_models(n_select = 3, exclude = ['catboost', 'lightgbm'], sort = 'Accuracy')
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
et | Extra Trees Classifier | 0.9578 | 0.9992 | 0.9576 | 0.9591 | 0.9578 | 0.9561 | 0.9561 | 0.5870 |
rf | Random Forest Classifier | 0.9483 | 0.9988 | 0.9480 | 0.9503 | 0.9483 | 0.9462 | 0.9463 | 0.6600 |
xgboost | Extreme Gradient Boosting | 0.9403 | 0.9990 | 0.9401 | 0.9421 | 0.9404 | 0.9379 | 0.9380 | 8.5670 |
knn | K Neighbors Classifier | 0.9290 | 0.9933 | 0.9286 | 0.9316 | 0.9291 | 0.9261 | 0.9262 | 0.1440 |
gbc | Gradient Boosting Classifier | 0.9041 | 0.9971 | 0.9037 | 0.9070 | 0.9043 | 0.9002 | 0.9003 | 12.7930 |
qda | Quadratic Discriminant Analysis | 0.8788 | 0.9956 | 0.8784 | 0.8829 | 0.8789 | 0.8739 | 0.8741 | 0.0210 |
dt | Decision Tree Classifier | 0.8413 | 0.9175 | 0.8408 | 0.8453 | 0.8414 | 0.8350 | 0.8351 | 0.0320 |
lr | Logistic Regression | 0.7695 | 0.9790 | 0.7685 | 0.7720 | 0.7682 | 0.7602 | 0.7604 | 3.5600 |
lda | Linear Discriminant Analysis | 0.6976 | 0.9662 | 0.6968 | 0.7143 | 0.6972 | 0.6855 | 0.6862 | 0.0230 |
nb | Naive Bayes | 0.6407 | 0.9559 | 0.6399 | 0.6572 | 0.6367 | 0.6263 | 0.6272 | 0.0200 |
ridge | Ridge Classifier | 0.5473 | 0.0000 | 0.5466 | 0.5818 | 0.5085 | 0.5292 | 0.5332 | 0.0110 |
svm | SVM - Linear Kernel | 0.5361 | 0.0000 | 0.5349 | 0.6464 | 0.5163 | 0.5174 | 0.5279 | 0.4470 |
ada | Ada Boost Classifier | 0.2418 | 0.8290 | 0.2423 | 0.2611 | 0.2106 | 0.2115 | 0.2157 | 0.2800 |
dummy | Dummy Classifier | 0.0404 | 0.5000 | 0.0385 | 0.0016 | 0.0031 | 0.0000 | 0.0000 | 0.0120 |
/opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. 
_warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_intercept_ was deprecated in version 0.23 and will be removed in 0.25. 
warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. 
_warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_intercept_ was deprecated in version 0.23 and will be removed in 0.25. 
warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. 
It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_intercept_ was deprecated in version 0.23 and will be removed in 0.25. 
warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute standard_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_coef_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute average_intercept_ was deprecated in version 0.23 and will be removed in 0.25. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. 
It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) /opt/conda/lib/python3.7/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: The outputs_2d_ attribute is deprecated in version 0.22 and will be removed in version 0.24. It is equivalent to n_outputs_ > 1. warnings.warn(msg, category=FutureWarning) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:846: RuntimeWarning: invalid value encountered in double_scalars mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
top_models
[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method='auto', validate_parameters=1, verbosity=0)]
# Hyperparameter-tune each shortlisted base model with PyCaret's
# default random-grid search, preserving the original ordering.
tuned_top_models = []
for base_model in top_models:
    tuned_top_models.append(tune_model(base_model))
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.9500 | 0.9995 | 0.9494 | 0.9517 | 0.9503 | 0.9480 | 0.9480 |
1 | 0.9582 | 0.9993 | 0.9585 | 0.9593 | 0.9582 | 0.9565 | 0.9565 | |
2 | 0.9388 | 0.9992 | 0.9380 | 0.9400 | 0.9387 | 0.9363 | 0.9364 | |
3 | 0.9378 | 0.9991 | 0.9370 | 0.9389 | 0.9376 | 0.9353 | 0.9353 | |
4 | 0.9408 | 0.9989 | 0.9403 | 0.9422 | 0.9410 | 0.9384 | 0.9385 | |
5 | 0.9459 | 0.9988 | 0.9452 | 0.9471 | 0.9460 | 0.9438 | 0.9438 | |
6 | 0.9418 | 0.9989 | 0.9417 | 0.9428 | 0.9418 | 0.9395 | 0.9396 | |
7 | 0.9469 | 0.9989 | 0.9466 | 0.9495 | 0.9470 | 0.9448 | 0.9449 | |
8 | 0.9306 | 0.9989 | 0.9310 | 0.9319 | 0.9300 | 0.9278 | 0.9279 | |
9 | 0.9551 | 0.9992 | 0.9549 | 0.9562 | 0.9548 | 0.9533 | 0.9533 | |
Mean | 0.9446 | 0.9991 | 0.9443 | 0.9460 | 0.9445 | 0.9424 | 0.9424 | |
Std | 0.0079 | 0.0002 | 0.0080 | 0.0080 | 0.0080 | 0.0082 | 0.0082 | |
Train | nan | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
tuned_top_models
[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=9, max_features=1.0, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0005, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=130, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced', criterion='entropy', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=6, min_samples_split=10, min_weight_fraction_leaf=0.0, n_estimators=140, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0)]
# Wrap each tuned model in a bagging ensemble (PyCaret's default
# ensemble_model method), keeping the same order as tuned_top_models.
bagged_top_models = []
for tuned_estimator in tuned_top_models:
    bagged_top_models.append(ensemble_model(tuned_estimator))
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.9459 | 0.9993 | 0.9455 | 0.9488 | 0.9464 | 0.9438 | 0.9438 |
1 | 0.9520 | 0.9991 | 0.9524 | 0.9535 | 0.9520 | 0.9501 | 0.9502 | |
2 | 0.9347 | 0.9993 | 0.9341 | 0.9364 | 0.9348 | 0.9321 | 0.9321 | |
3 | 0.9276 | 0.9988 | 0.9271 | 0.9295 | 0.9277 | 0.9246 | 0.9247 | |
4 | 0.9388 | 0.9988 | 0.9382 | 0.9408 | 0.9392 | 0.9363 | 0.9364 | |
5 | 0.9398 | 0.9988 | 0.9393 | 0.9416 | 0.9400 | 0.9374 | 0.9374 | |
6 | 0.9459 | 0.9992 | 0.9461 | 0.9477 | 0.9458 | 0.9438 | 0.9438 | |
7 | 0.9439 | 0.9988 | 0.9435 | 0.9463 | 0.9440 | 0.9416 | 0.9417 | |
8 | 0.9265 | 0.9986 | 0.9271 | 0.9288 | 0.9263 | 0.9236 | 0.9237 | |
9 | 0.9510 | 0.9992 | 0.9511 | 0.9526 | 0.9508 | 0.9490 | 0.9491 | |
Mean | 0.9406 | 0.9990 | 0.9404 | 0.9426 | 0.9407 | 0.9382 | 0.9383 | |
Std | 0.0084 | 0.0002 | 0.0085 | 0.0084 | 0.0084 | 0.0088 | 0.0088 | |
Train | nan | 0.9964 | 1.0000 | 0.9964 | 0.9964 | 0.9964 | 0.9963 | 0.9963 |
bagged_top_models
[BaggingClassifier(base_estimator=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=9, max_features=1.0, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0005, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=130, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced', criterion='entropy', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.001, min_impurity_split=None, min_samples_leaf=6, min_samples_split=10, min_weight_fraction_leaf=0.0, n_estimators=140, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False)]
# Ask PyCaret's automl() for the best pipeline seen in this experiment,
# once for each optimization metric of interest.
best1, best2, best3, best4, best5 = (
    automl(optimize = metric)
    for metric in ('AUC', 'Accuracy', 'Recall', 'Precision', 'F1')
)
# Report each winner; labels and spacing match the metric names above.
for label, winner in (('AUC', best1), ('Accuracy', best2), ('Recall', best3),
                      ('Precision', best4), ('F1', best5)):
    print()
    print(f"Best model based on {label}: ")
    print(winner)
Best model based on AUC: <catboost.core.CatBoostClassifier object at 0x7f2ae0fbb590> Best model based on Accuracy: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False) Best model based on Recall: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False) Best model based on Precision: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False) Best model based on F1: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
# Visual diagnostics for the best-by-Accuracy model: ROC curves,
# confusion matrix, and learning curve (rendered in this order).
for diagnostic in ('auc', 'confusion_matrix', 'learning'):
    plot_model(best2, plot = diagnostic)
# Persist the full preprocessing pipeline plus fitted model to
# 'Final_Model.pkl' for later reuse.
save_model(best2, 'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='target', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_strat... ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)]], verbose=False), 'Final_Model.pkl')
# Reload the persisted pipeline + model saved above as 'Final_Model.pkl'.
load_saved_model = load_model('Final_Model')
# Score unseen rows. NOTE(review): `data_unseen` is not defined in the
# visible portion of this notebook — presumably a hold-out DataFrame
# created in an earlier cell; confirm it exists before running.
new_prediction = predict_model(load_saved_model, data=data_unseen)
# Show the predicted class label and its probability score for the
# first five rows (cell output is displayed by the notebook).
new_prediction[["Label", "Score"]].head()
Transformation Pipeline and Model Successfully Loaded
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Extra Trees Classifier | 0.9627 | 0.9995 | 0.9627 | 0.9630 | 0.9627 | 0.9612 | 0.9612 |
Label | Score | |
---|---|---|
0 | 23 | 0.98 |
1 | 3 | 1.00 |
2 | 25 | 0.70 |
3 | 13 | 0.98 |
4 | 5 | 0.57 |
In this coding recipe, we demonstrated how to build a multiclass classification model in Python using PyCaret: comparing baseline models, tuning and bagging the top performers, selecting the best pipeline with automl(), and saving and reloading the final model for prediction on unseen data.