# For more projects visit: https://setscholars.net
# Silence library warnings so the notebook output stays readable.
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

from pycaret.classification import *
from pycaret.datasets import get_data

# Name of the built-in PyCaret dataset to fetch.
whichDataset = 'poker'
# Load the dataset (get_data also displays its head in a notebook).
dataset = get_data(whichDataset)
S1 | C1 | S2 | C2 | S3 | C3 | S4 | C4 | S5 | C5 | CLASS | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 13 | 2 | 4 | 2 | 3 | 1 | 12 | 0 |
1 | 3 | 12 | 3 | 2 | 3 | 11 | 4 | 5 | 2 | 5 | 1 |
2 | 1 | 9 | 4 | 6 | 1 | 4 | 3 | 2 | 3 | 9 | 1 |
3 | 1 | 4 | 3 | 13 | 2 | 13 | 2 | 1 | 3 | 6 | 1 |
4 | 3 | 10 | 2 | 7 | 1 | 2 | 2 | 11 | 4 | 9 | 0 |
# Overall size of the loaded dataset: (rows, columns).
dataset.shape
(99998, 11)
# Column names: S1..C5 (presumably suit/rank pairs of a poker hand — TODO confirm)
# plus the target column CLASS.
dataset.columns.to_list()
['S1', 'C1', 'S2', 'C2', 'S3', 'C3', 'S4', 'C4', 'S5', 'C5', 'CLASS']
# Reserve 25% of rows as truly unseen data for a final, post-training check;
# the remaining 75% go to PyCaret for modelling. Seed fixed for reproducibility.
modelling_rows = dataset.sample(frac=0.75, random_state=1234)
data_unseen = dataset.drop(modelling_rows.index).reset_index(drop=True)
data = modelling_rows.reset_index(drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
Data for Modeling: (74998, 11) Unseen Data For Predictions: (25000, 11)
# Initialise the PyCaret classification experiment on the modelling split;
# 'CLASS' is the target, session_id fixes all random seeds for reproducibility.
env_setup = setup(data = data, target = 'CLASS', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | CLASS |
2 | Target Type | Multiclass |
3 | Label Encoded | None |
4 | Original Data | (74998, 11) |
5 | Missing Values | False |
6 | Numeric Features | 0 |
7 | Categorical Features | 10 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (52498, 85) |
12 | Transformed Test Set | (22500, 85) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | e223 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# --------------------------------------
# Cross-validate every available baseline classifier and return the single
# best one (ranked by Accuracy by default).
best_model = compare_models()
# --------------------------------------
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
catboost | CatBoost Classifier | 0.9926 | 0.8990 | 0.5285 | 0.9869 | 0.9892 | 0.9869 | 0.9870 | 26.2810 |
xgboost | Extreme Gradient Boosting | 0.9330 | 0.8743 | 0.3647 | 0.9136 | 0.9091 | 0.8767 | 0.8818 | 42.1190 |
lightgbm | Light Gradient Boosting Machine | 0.8322 | 0.7993 | 0.2863 | 0.8070 | 0.8069 | 0.6899 | 0.6956 | 1.3840 |
et | Extra Trees Classifier | 0.7664 | 0.8063 | 0.2100 | 0.7425 | 0.7313 | 0.5590 | 0.5713 | 5.2300 |
gbc | Gradient Boosting Classifier | 0.7588 | 0.8260 | 0.2244 | 0.7377 | 0.7217 | 0.5423 | 0.5703 | 27.9410 |
rf | Random Forest Classifier | 0.7473 | 0.7985 | 0.2026 | 0.7072 | 0.7102 | 0.5214 | 0.5375 | 3.8100 |
dt | Decision Tree Classifier | 0.5562 | 0.5537 | 0.2211 | 0.5589 | 0.5574 | 0.2222 | 0.2223 | 0.2920 |
knn | K Neighbors Classifier | 0.5406 | 0.5551 | 0.1575 | 0.5137 | 0.5137 | 0.1339 | 0.1371 | 41.7780 |
dummy | Dummy Classifier | 0.5003 | 0.4500 | 0.1268 | 0.2503 | 0.3337 | 0.0000 | 0.0000 | 0.0270 |
svm | SVM - Linear Kernel | 0.4991 | 0.0000 | 0.1269 | 0.4305 | 0.3465 | 0.0012 | 0.0035 | 0.8310 |
lr | Logistic Regression | 0.4989 | 0.4503 | 0.1269 | 0.4342 | 0.3507 | 0.0009 | 0.0031 | 4.7410 |
ridge | Ridge Classifier | 0.4988 | 0.0000 | 0.1269 | 0.4338 | 0.3503 | 0.0008 | 0.0030 | 0.0520 |
lda | Linear Discriminant Analysis | 0.4988 | 0.4506 | 0.1269 | 0.4341 | 0.3508 | 0.0009 | 0.0032 | 0.3260 |
ada | Ada Boost Classifier | 0.4221 | 0.4480 | 0.1078 | 0.4376 | 0.3775 | 0.0098 | 0.0126 | 1.0200 |
nb | Naive Bayes | 0.3032 | 0.4526 | 0.1585 | 0.4258 | 0.3450 | 0.0013 | 0.0012 | 0.0590 |
qda | Quadratic Discriminant Analysis | 0.0015 | 0.4500 | 0.1234 | 0.0000 | 0.0000 | -0.0000 | -0.0001 | 0.3230 |
# Train an Extra Trees classifier with PyCaret's default 10-fold CV.
et = create_model('et')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.7808 | 0.9057 | 0.2147 | 0.7649 | 0.7465 | 0.5866 | 0.5988 |
1 | 0.7613 | 0.8956 | 0.2041 | 0.7503 | 0.7257 | 0.5490 | 0.5621 |
2 | 0.7598 | 0.8888 | 0.2049 | 0.7349 | 0.7251 | 0.5467 | 0.5588 |
3 | 0.7630 | 0.8947 | 0.2053 | 0.7729 | 0.7273 | 0.5522 | 0.5656 |
4 | 0.7644 | 0.8985 | 0.2065 | 0.7177 | 0.7293 | 0.5554 | 0.5671 |
5 | 0.7630 | 0.8957 | 0.2039 | 0.7030 | 0.7274 | 0.5527 | 0.5650 |
6 | 0.7720 | 0.9001 | 0.2074 | 0.7595 | 0.7370 | 0.5699 | 0.5813 |
7 | 0.7632 | 0.8953 | 0.2075 | 0.7450 | 0.7290 | 0.5535 | 0.5655 |
8 | 0.7754 | 0.0000 | 0.2394 | 0.7352 | 0.7401 | 0.5761 | 0.5882 |
9 | 0.7605 | 0.8890 | 0.2061 | 0.7412 | 0.7257 | 0.5478 | 0.5608 |
Mean | 0.7664 | 0.8063 | 0.2100 | 0.7425 | 0.7313 | 0.5590 | 0.5713 |
SD | 0.0068 | 0.2688 | 0.0102 | 0.0202 | 0.0069 | 0.0130 | 0.0127 |
# Tune the Extra Trees hyper-parameters (PyCaret's default random search
# over a predefined grid, optimising Accuracy).
tuned_et = tune_model(et)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.5787 | 0.6167 | 0.3833 | 0.6240 | 0.5647 | 0.2878 | 0.3076 |
1 | 0.5642 | 0.6140 | 0.3707 | 0.6044 | 0.5534 | 0.2645 | 0.2803 |
2 | 0.5490 | 0.6058 | 0.3451 | 0.5919 | 0.5454 | 0.2481 | 0.2603 |
3 | 0.5726 | 0.6165 | 0.3450 | 0.6205 | 0.5680 | 0.2840 | 0.2990 |
4 | 0.5750 | 0.6208 | 0.3565 | 0.6158 | 0.5671 | 0.2843 | 0.2996 |
5 | 0.5590 | 0.6146 | 0.3507 | 0.6022 | 0.5536 | 0.2603 | 0.2739 |
6 | 0.5613 | 0.6068 | 0.3395 | 0.5963 | 0.5577 | 0.2627 | 0.2736 |
7 | 0.5648 | 0.6178 | 0.3700 | 0.6104 | 0.5589 | 0.2737 | 0.2883 |
8 | 0.5656 | 0.0000 | 0.3902 | 0.6075 | 0.5592 | 0.2708 | 0.2849 |
9 | 0.5466 | 0.6075 | 0.3545 | 0.5899 | 0.5397 | 0.2473 | 0.2609 |
Mean | 0.5637 | 0.5521 | 0.3606 | 0.6063 | 0.5568 | 0.2683 | 0.2828 |
SD | 0.0099 | 0.1841 | 0.0163 | 0.0110 | 0.0087 | 0.0137 | 0.0153 |
# Show the tuned estimator's final hyper-parameters.
print(tuned_et)
ExtraTreesClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced_subsample', criterion='gini', max_depth=10, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0, min_impurity_split=None, min_samples_leaf=5, min_samples_split=7, min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
# Diagnostic plots for the tuned Extra Trees model.
plot_model(tuned_et, plot = 'auc')
plot_model(tuned_et, plot = 'pr')
# Feature importances.
plot_model(tuned_et, plot='feature')
plot_model(tuned_et, plot = 'confusion_matrix')
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
# Learning curve.
plot_model(tuned_et, plot = 'learning')
#plot_model(tuned_et, plot = 'threshold')
plot_model(tuned_et, plot = 'boundary')
plot_model(tuned_et, plot = 'error')
# Train the top performer from compare_models (CatBoost) with 10-fold CV.
catboost = create_model('catboost')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9926 | 0.9985 | 0.5125 | 0.9870 | 0.9890 | 0.9869 | 0.9869 |
1 | 0.9930 | 0.9991 | 0.5434 | 0.9887 | 0.9898 | 0.9875 | 0.9876 |
2 | 0.9926 | 0.9987 | 0.5125 | 0.9870 | 0.9890 | 0.9869 | 0.9869 |
3 | 0.9930 | 0.9983 | 0.5375 | 0.9873 | 0.9898 | 0.9875 | 0.9876 |
4 | 0.9926 | 0.9990 | 0.5239 | 0.9870 | 0.9892 | 0.9869 | 0.9869 |
5 | 0.9922 | 0.9988 | 0.5102 | 0.9866 | 0.9887 | 0.9862 | 0.9863 |
6 | 0.9922 | 0.9989 | 0.5114 | 0.9864 | 0.9886 | 0.9862 | 0.9863 |
7 | 0.9926 | 0.9989 | 0.5250 | 0.9868 | 0.9891 | 0.9869 | 0.9869 |
8 | 0.9924 | 0.0000 | 0.5714 | 0.9849 | 0.9886 | 0.9865 | 0.9866 |
9 | 0.9930 | 0.9994 | 0.5375 | 0.9873 | 0.9898 | 0.9875 | 0.9876 |
Mean | 0.9926 | 0.8990 | 0.5285 | 0.9869 | 0.9892 | 0.9869 | 0.9870 |
SD | 0.0003 | 0.2997 | 0.0184 | 0.0009 | 0.0005 | 0.0005 | 0.0005 |
# Tune the CatBoost hyper-parameters with PyCaret's default search.
tuned_catboost = tune_model(catboost)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.9912 | 0.9994 | 0.5262 | 0.9898 | 0.9884 | 0.9845 | 0.9846 |
1 | 0.9912 | 0.9994 | 0.5128 | 0.9877 | 0.9880 | 0.9845 | 0.9846 |
2 | 0.9912 | 0.9995 | 0.4938 | 0.9837 | 0.9874 | 0.9845 | 0.9846 |
3 | 0.9916 | 0.9994 | 0.5182 | 0.9860 | 0.9883 | 0.9852 | 0.9852 |
4 | 0.9910 | 0.9998 | 0.5041 | 0.9855 | 0.9875 | 0.9842 | 0.9842 |
5 | 0.9910 | 0.9997 | 0.5030 | 0.9875 | 0.9877 | 0.9842 | 0.9842 |
6 | 0.9891 | 0.9993 | 0.4844 | 0.9814 | 0.9852 | 0.9808 | 0.9809 |
7 | 0.9916 | 0.9995 | 0.5216 | 0.9900 | 0.9887 | 0.9852 | 0.9852 |
8 | 0.9912 | 0.0000 | 0.5648 | 0.9839 | 0.9875 | 0.9845 | 0.9846 |
9 | 0.9918 | 0.9997 | 0.5079 | 0.9862 | 0.9883 | 0.9855 | 0.9856 |
Mean | 0.9911 | 0.8996 | 0.5137 | 0.9862 | 0.9877 | 0.9843 | 0.9844 |
SD | 0.0007 | 0.2999 | 0.0209 | 0.0026 | 0.0009 | 0.0013 | 0.0012 |
# Show the tuned CatBoost estimator (CatBoost prints only the default repr).
print(tuned_catboost)
<catboost.core.CatBoostClassifier object at 0x7fd09e59eb90>
# Diagnostic plots for the tuned CatBoost model.
plot_model(tuned_catboost, plot = 'auc')
plot_model(tuned_catboost, plot = 'pr')
plot_model(tuned_catboost, plot = 'feature')
plot_model(tuned_catboost, plot = 'confusion_matrix')
plot_model(tuned_catboost, plot = 'learning')
# NOTE(review): the original disabled line referenced tuned_rf (a leftover
# from another recipe); corrected to name this section's model.
#plot_model(tuned_catboost, plot = 'threshold')
plot_model(tuned_catboost, plot = 'boundary')
plot_model(tuned_catboost, plot = 'error')
# Evaluate the tuned Extra Trees model on PyCaret's hold-out test split.
predict_model(tuned_et);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Extra Trees Classifier | 0.5644 | 0.6145 | 0.3560 | 0.6168 | 0.5555 | 0.2723 | 0.2898 |
# Evaluate the tuned CatBoost model on the same hold-out test split.
predict_model(tuned_catboost);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | CatBoost Classifier | 0.9926 | 0.9997 | 0.5231 | 0.9902 | 0.9899 | 0.9868 | 0.9869 |
# Refit the tuned model on the entire modelling set (train + hold-out);
# this is the model intended for deployment.
final_catboost = finalize_model(tuned_catboost);
# Final model parameters for deployment
print(final_catboost)
<catboost.core.CatBoostClassifier object at 0x7fd09df5e2d0>
# Scores here are optimistic: the finalized model was refit on the hold-out
# rows it is being evaluated against.
predict_model(final_catboost);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | CatBoost Classifier | 0.9979 | 1.0000 | 0.8021 | 0.9979 | 0.9976 | 0.9963 | 0.9963 |
# Score the 25% of rows withheld before setup(); adds Label (predicted class)
# and Score (prediction probability) columns.
unseen_predictions = predict_model(final_catboost, data=data_unseen)
unseen_predictions.head()
S1 | C1 | S2 | C2 | S3 | C3 | S4 | C4 | S5 | C5 | CLASS | Label | Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 4 | 3 | 13 | 2 | 13 | 2 | 1 | 3 | 6 | 1 | 1 | 0.9793 |
1 | 1 | 3 | 4 | 5 | 3 | 4 | 1 | 12 | 4 | 6 | 0 | 0 | 0.9806 |
2 | 2 | 6 | 4 | 11 | 2 | 3 | 4 | 9 | 1 | 7 | 0 | 0 | 0.9879 |
3 | 3 | 2 | 4 | 9 | 3 | 7 | 4 | 3 | 4 | 5 | 0 | 0 | 0.9874 |
4 | 4 | 7 | 3 | 12 | 1 | 13 | 1 | 9 | 2 | 6 | 0 | 0 | 0.9885 |
from pycaret.utils import check_metric
# Compare predicted Label against the true CLASS on the unseen data.
check_metric(unseen_predictions['CLASS'], unseen_predictions['Label'], metric = 'Accuracy')
0.9933
check_metric(unseen_predictions['CLASS'], unseen_predictions['Label'], metric = 'Recall')
0.5445
check_metric(unseen_predictions['CLASS'], unseen_predictions['Label'], metric = 'Precision')
0.9918
# NOTE(review): the AUC check was left disabled in the original run —
# presumably because it needs class probabilities rather than labels; verify.
#check_metric(unseen_predictions['CLASS'], unseen_predictions['Label'], metric = 'AUC')
check_metric(unseen_predictions['CLASS'], unseen_predictions['Label'], metric = 'F1')
0.9905
# Persist the whole preprocessing pipeline + model to 'Final_Model.pkl'.
save_model(final_catboost,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='CLASS', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_strate... ('binn', 'passthrough'), ('rem_outliers', 'passthrough'), ('cluster_all', 'passthrough'), ('dummy', Dummify(target='CLASS')), ('fix_perfect', Remove_100(target='CLASS')), ('clean_names', Clean_Colum_Names()), ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'), ('dfs', 'passthrough'), ('pca', 'passthrough'), ['trained_model', <catboost.core.CatBoostClassifier object at 0x7fd09df5e2d0>]], verbose=False), 'Final_Model.pkl')
# Reload the saved pipeline + model and verify it still predicts on unseen data.
load_saved_model = load_model('Final_Model')
Transformation Pipeline and Model Successfully Loaded
new_prediction = predict_model(load_saved_model, data=data_unseen)
#new_prediction[["Label", "Score"]].head(10)
# Second experiment: re-import everything and reload the same dataset,
# suppressing warnings as before.
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

from pycaret.classification import *
from pycaret.datasets import get_data

# Same built-in PyCaret dataset as in the first run.
whichDataset = 'poker'
dataset = get_data(whichDataset)
# Same 75/25 modelling/unseen split as before, but drawn with a different
# seed (421) so this run sees a different partition of the rows.
sampled = dataset.sample(frac=0.75, random_state=421)
data_unseen = dataset.drop(sampled.index).reset_index(drop=True)
data = sampled.reset_index(drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
S1 | C1 | S2 | C2 | S3 | C3 | S4 | C4 | S5 | C5 | CLASS | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 13 | 2 | 4 | 2 | 3 | 1 | 12 | 0 |
1 | 3 | 12 | 3 | 2 | 3 | 11 | 4 | 5 | 2 | 5 | 1 |
2 | 1 | 9 | 4 | 6 | 1 | 4 | 3 | 2 | 3 | 9 | 1 |
3 | 1 | 4 | 3 | 13 | 2 | 13 | 2 | 1 | 3 | 6 | 1 |
4 | 3 | 10 | 2 | 7 | 1 | 2 | 2 | 11 | 4 | 9 | 0 |
Data for Modeling: (74998, 11) Unseen Data For Predictions: (25000, 11)
# Initialise a fresh PyCaret experiment on this run's modelling split.
clf = setup(data = data, target = 'CLASS', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | CLASS |
2 | Target Type | Multiclass |
3 | Label Encoded | None |
4 | Original Data | (74998, 11) |
5 | Missing Values | False |
6 | Numeric Features | 0 |
7 | Categorical Features | 10 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (52498, 85) |
12 | Transformed Test Set | (22500, 85) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 1ece |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# Cross-validate every baseline classifier and keep the 5 best
# (ranked by Accuracy by default) instead of only the single winner.
top_models = compare_models(n_select = 5)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
catboost | CatBoost Classifier | 0.9925 | 0.8989 | 0.5256 | 0.9865 | 0.9891 | 0.9868 | 0.9868 | 26.0210 |
xgboost | Extreme Gradient Boosting | 0.9317 | 0.8714 | 0.3516 | 0.9085 | 0.9080 | 0.8743 | 0.8792 | 42.4510 |
lightgbm | Light Gradient Boosting Machine | 0.8748 | 0.8296 | 0.2825 | 0.8543 | 0.8482 | 0.7684 | 0.7733 | 1.3810 |
et | Extra Trees Classifier | 0.7840 | 0.8187 | 0.2146 | 0.7594 | 0.7491 | 0.5922 | 0.6037 | 5.1510 |
gbc | Gradient Boosting Classifier | 0.7701 | 0.8344 | 0.2447 | 0.7383 | 0.7347 | 0.5638 | 0.5892 | 27.9150 |
rf | Random Forest Classifier | 0.7585 | 0.8083 | 0.2050 | 0.7122 | 0.7213 | 0.5420 | 0.5583 | 3.8610 |
dt | Decision Tree Classifier | 0.5756 | 0.5698 | 0.2322 | 0.5779 | 0.5766 | 0.2543 | 0.2544 | 0.2890 |
knn | K Neighbors Classifier | 0.5464 | 0.5612 | 0.1553 | 0.5151 | 0.5190 | 0.1433 | 0.1468 | 42.0790 |
dummy | Dummy Classifier | 0.5020 | 0.4500 | 0.1268 | 0.2520 | 0.3355 | 0.0000 | 0.0000 | 0.0270 |
svm | SVM - Linear Kernel | 0.5009 | 0.0000 | 0.1270 | 0.4341 | 0.3490 | 0.0016 | 0.0044 | 0.8280 |
lr | Logistic Regression | 0.5008 | 0.4489 | 0.1267 | 0.4293 | 0.3441 | -0.0004 | -0.0012 | 4.5230 |
lda | Linear Discriminant Analysis | 0.5008 | 0.4489 | 0.1267 | 0.4299 | 0.3442 | -0.0003 | -0.0010 | 0.3260 |
ridge | Ridge Classifier | 0.5007 | 0.0000 | 0.1267 | 0.4278 | 0.3436 | -0.0006 | -0.0020 | 0.0510 |
ada | Ada Boost Classifier | 0.3190 | 0.4500 | 0.0831 | 0.4065 | 0.3166 | 0.0004 | 0.0005 | 1.0230 |
qda | Quadratic Discriminant Analysis | 0.2879 | 0.4520 | 0.1144 | 0.3920 | 0.3189 | 0.0029 | 0.0032 | 0.2590 |
nb | Naive Bayes | 0.2682 | 0.4542 | 0.1590 | 0.4335 | 0.3202 | 0.0069 | 0.0078 | 0.0590 |
# Display the five selected estimators.
top_models
[<catboost.core.CatBoostClassifier at 0x7fc65c09b450>, XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=-1, num_parallel_tree=1, objective='multi:softprob', random_state=1234, reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method='auto', validate_parameters=1, verbosity=0), LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=1234, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False), GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)]
# Fine-tune each of the five shortlisted base models.
tuned_top_models = []
for candidate in top_models:
    tuned_top_models.append(tune_model(candidate))
tuned_top_models
# Bag (ensemble) each tuned model.
bagged_top_models = []
for candidate in tuned_top_models:
    bagged_top_models.append(ensemble_model(candidate))
bagged_top_models
# Ask PyCaret for the best model trained this session, once per metric.
best1 = automl(optimize = 'AUC')
best2 = automl(optimize = 'Accuracy')
best3 = automl(optimize = 'Recall')
best4 = automl(optimize = 'Precision')
best5 = automl(optimize = 'F1')
print()
print("Best model based on AUC: ")
print(best1)
print()
print("Best model based on Accuracy: ")
print(best2)
print()
print("Best model based on Recall: ")
print(best3)
print()
print("Best model based on Precision: ")
print(best4)
print()
print("Best model based on F1: ")
print(best5)
# Diagnostics for the Accuracy winner.
plot_model(best2, plot = 'auc')
plot_model(best2, plot = 'confusion_matrix')
plot_model(best2, plot = 'learning')
# Persist it, reload it, and score the unseen rows as a round-trip check.
save_model(best2,'Final_Model')
load_saved_model = load_model('Final_Model')
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head()
In this coding recipe, we discussed how to build a machine learning model in Python using PyCaret.