# For more projects visit: https://setscholars.net
# Suppress warnings in Jupyter Notebooks
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use the fivethirtyeight style for every matplotlib figure below
plt.style.use('fivethirtyeight')
# PyCaret's classification module exposes setup/compare_models/tune_model/etc.
# at module level via the star import
from pycaret.classification import *
# provide the dataset name as shown in pycaret
whichDataset = 'income'
from pycaret.datasets import get_data
# Load the 'income' (UCI Adult census) dataset bundled with PyCaret;
# target column is 'income >50K' (binary 0/1)
dataset = get_data(whichDataset)
age | workclass | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income >50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | 0 |
1 | 50 | Self-emp-not-inc | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | 0 |
2 | 38 | Private | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | 0 |
3 | 53 | Private | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | 0 |
4 | 28 | Private | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | 0 |
# Inspect the raw dataset dimensions (rows, columns)
dataset.shape
(32561, 14)
# List all column names; the last one, 'income >50K', is the target
dataset.columns.to_list()
['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income >50K']
# Hold out 25% of rows as truly "unseen" data for a final sanity check;
# the remaining 75% goes to PyCaret for its own train/test split.
data = dataset.sample(frac=0.75, random_state=1234)
data_unseen = dataset.drop(data.index)
# Reset both frames to clean 0..n-1 integer indices
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
Data for Modeling: (24421, 14) Unseen Data For Predictions: (8140, 14)
# Initialize the PyCaret experiment: infers column types, imputes missing
# values, one-hot encodes categoricals, and creates the internal
# train/test split (session_id fixes all random seeds for reproducibility).
env_setup = setup(data = data, target = 'income >50K', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | income >50K |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (24421, 14) |
5 | Missing Values | True |
6 | Numeric Features | 4 |
7 | Categorical Features | 9 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (17094, 104) |
12 | Transformed Test Set | (7327, 104) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | a69e |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
# NOTE(review): warnings were already suppressed at the top of the file;
# re-importing/re-filtering here is harmless but redundant.
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# --------------------------------------
# Cross-validate every available classifier and return the single best
# model (ranked by Accuracy, the default sort metric).
best_model = compare_models()
# --------------------------------------
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
lightgbm | Light Gradient Boosting Machine | 0.8715 | 0.9253 | 0.6554 | 0.7776 | 0.7112 | 0.6293 | 0.6332 | 0.1170 |
catboost | CatBoost Classifier | 0.8697 | 0.9250 | 0.6398 | 0.7808 | 0.7033 | 0.6208 | 0.6259 | 6.1470 |
xgboost | Extreme Gradient Boosting | 0.8695 | 0.9237 | 0.6532 | 0.7714 | 0.7073 | 0.6240 | 0.6277 | 2.2300 |
gbc | Gradient Boosting Classifier | 0.8640 | 0.9173 | 0.5785 | 0.8035 | 0.6726 | 0.5897 | 0.6024 | 1.0690 |
ada | Ada Boost Classifier | 0.8598 | 0.9111 | 0.6146 | 0.7592 | 0.6791 | 0.5907 | 0.5962 | 0.3300 |
lr | Logistic Regression | 0.8501 | 0.9029 | 0.5914 | 0.7360 | 0.6557 | 0.5613 | 0.5669 | 1.9980 |
rf | Random Forest Classifier | 0.8470 | 0.8896 | 0.6193 | 0.7099 | 0.6613 | 0.5631 | 0.5654 | 0.9390 |
knn | K Neighbors Classifier | 0.8435 | 0.8677 | 0.6246 | 0.6957 | 0.6581 | 0.5571 | 0.5585 | 0.9510 |
lda | Linear Discriminant Analysis | 0.8418 | 0.8905 | 0.5601 | 0.7223 | 0.6308 | 0.5322 | 0.5392 | 0.1280 |
ridge | Ridge Classifier | 0.8411 | 0.0000 | 0.5090 | 0.7530 | 0.6073 | 0.5123 | 0.5278 | 0.0280 |
et | Extra Trees Classifier | 0.8240 | 0.8487 | 0.5824 | 0.6518 | 0.6150 | 0.5014 | 0.5028 | 1.1500 |
dt | Decision Tree Classifier | 0.8196 | 0.7684 | 0.6217 | 0.6276 | 0.6244 | 0.5058 | 0.5059 | 0.0710 |
nb | Naive Bayes | 0.8025 | 0.8925 | 0.8049 | 0.5637 | 0.6630 | 0.5294 | 0.5462 | 0.0270 |
svm | SVM - Linear Kernel | 0.7759 | 0.0000 | 0.4636 | 0.5735 | 0.4643 | 0.3387 | 0.3674 | 0.0840 |
dummy | Dummy Classifier | 0.7586 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0190 |
qda | Quadratic Discriminant Analysis | 0.2492 | 0.5045 | 0.9981 | 0.2430 | 0.3909 | 0.0043 | 0.0408 | 0.0640 |
# Train a CatBoost classifier with 10-fold cross-validation
catboost = create_model('catboost')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8801 | 0.9299 | 0.6586 | 0.8095 | 0.7263 | 0.6506 | 0.6563 |
1 | 0.8725 | 0.9321 | 0.6392 | 0.7928 | 0.7078 | 0.6274 | 0.6334 |
2 | 0.8678 | 0.9228 | 0.6271 | 0.7825 | 0.6962 | 0.6131 | 0.6192 |
3 | 0.8632 | 0.9287 | 0.6320 | 0.7609 | 0.6905 | 0.6036 | 0.6079 |
4 | 0.8777 | 0.9391 | 0.6610 | 0.7982 | 0.7232 | 0.6456 | 0.6503 |
5 | 0.8578 | 0.9159 | 0.6271 | 0.7443 | 0.6807 | 0.5901 | 0.5937 |
6 | 0.8742 | 0.9217 | 0.6505 | 0.7906 | 0.7137 | 0.6341 | 0.6390 |
7 | 0.8555 | 0.9106 | 0.6068 | 0.7463 | 0.6693 | 0.5781 | 0.5832 |
8 | 0.8719 | 0.9224 | 0.6505 | 0.7813 | 0.7099 | 0.6286 | 0.6329 |
9 | 0.8760 | 0.9270 | 0.6456 | 0.8012 | 0.7151 | 0.6369 | 0.6430 |
Mean | 0.8697 | 0.9250 | 0.6398 | 0.7808 | 0.7033 | 0.6208 | 0.6259 |
SD | 0.0080 | 0.0078 | 0.0159 | 0.0217 | 0.0176 | 0.0227 | 0.0231 |
# Hyperparameter-tune the CatBoost model (randomized search over
# PyCaret's default grid, optimizing Accuracy by default)
tuned_catboost = tune_model(catboost)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8813 | 0.9291 | 0.6441 | 0.8261 | 0.7238 | 0.6497 | 0.6578 |
1 | 0.8743 | 0.9321 | 0.6441 | 0.7964 | 0.7122 | 0.6329 | 0.6387 |
2 | 0.8667 | 0.9198 | 0.6368 | 0.7713 | 0.6976 | 0.6131 | 0.6177 |
3 | 0.8608 | 0.9253 | 0.6295 | 0.7536 | 0.6860 | 0.5975 | 0.6015 |
4 | 0.8812 | 0.9400 | 0.6707 | 0.8052 | 0.7318 | 0.6564 | 0.6609 |
5 | 0.8619 | 0.9180 | 0.6199 | 0.7642 | 0.6845 | 0.5973 | 0.6027 |
6 | 0.8695 | 0.9197 | 0.6432 | 0.7771 | 0.7039 | 0.6211 | 0.6257 |
7 | 0.8537 | 0.9115 | 0.6092 | 0.7382 | 0.6676 | 0.5749 | 0.5792 |
8 | 0.8713 | 0.9207 | 0.6311 | 0.7927 | 0.7027 | 0.6219 | 0.6285 |
9 | 0.8771 | 0.9246 | 0.6553 | 0.7988 | 0.7200 | 0.6423 | 0.6474 |
Mean | 0.8698 | 0.9241 | 0.6384 | 0.7824 | 0.7030 | 0.6207 | 0.6260 |
SD | 0.0087 | 0.0077 | 0.0166 | 0.0250 | 0.0189 | 0.0244 | 0.0250 |
# Show the tuned estimator's repr
print(tuned_catboost)
<catboost.core.CatBoostClassifier object at 0x7f91abedd0d0>
# Diagnostic plots for the tuned CatBoost model
plot_model(tuned_catboost, plot = 'auc')
plot_model(tuned_catboost, plot = 'pr')
plot_model(tuned_catboost, plot='feature')
plot_model(tuned_catboost, plot = 'confusion_matrix')
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
plot_model(tuned_catboost, plot = 'learning')
plot_model(tuned_catboost, plot = 'threshold')
# Train a Random Forest baseline for comparison
rf = create_model('rf')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8602 | 0.8860 | 0.6271 | 0.7529 | 0.6843 | 0.5955 | 0.5996 |
1 | 0.8386 | 0.8968 | 0.6005 | 0.6908 | 0.6425 | 0.5389 | 0.5411 |
2 | 0.8351 | 0.8874 | 0.6029 | 0.6785 | 0.6385 | 0.5321 | 0.5337 |
3 | 0.8322 | 0.8905 | 0.5763 | 0.6800 | 0.6239 | 0.5168 | 0.5197 |
4 | 0.8543 | 0.9019 | 0.6271 | 0.7316 | 0.6754 | 0.5821 | 0.5850 |
5 | 0.8467 | 0.8865 | 0.6392 | 0.7003 | 0.6684 | 0.5689 | 0.5699 |
6 | 0.8572 | 0.8910 | 0.6553 | 0.7258 | 0.6888 | 0.5965 | 0.5978 |
7 | 0.8356 | 0.8743 | 0.5850 | 0.6866 | 0.6317 | 0.5267 | 0.5296 |
8 | 0.8537 | 0.8900 | 0.6505 | 0.7166 | 0.6819 | 0.5872 | 0.5884 |
9 | 0.8561 | 0.8918 | 0.6286 | 0.7358 | 0.6780 | 0.5861 | 0.5891 |
Mean | 0.8470 | 0.8896 | 0.6193 | 0.7099 | 0.6613 | 0.5631 | 0.5654 |
SD | 0.0101 | 0.0069 | 0.0256 | 0.0248 | 0.0232 | 0.0295 | 0.0295 |
# Hyperparameter-tune the Random Forest model
tuned_rf = tune_model(rf)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8614 | 0.8986 | 0.5617 | 0.8056 | 0.6619 | 0.5782 | 0.5931 |
1 | 0.8591 | 0.9047 | 0.5981 | 0.7671 | 0.6721 | 0.5841 | 0.5914 |
2 | 0.8480 | 0.8887 | 0.5690 | 0.7413 | 0.6438 | 0.5493 | 0.5571 |
3 | 0.8515 | 0.8958 | 0.5375 | 0.7789 | 0.6361 | 0.5467 | 0.5615 |
4 | 0.8537 | 0.9013 | 0.5860 | 0.7539 | 0.6594 | 0.5681 | 0.5754 |
5 | 0.8555 | 0.8893 | 0.5690 | 0.7730 | 0.6555 | 0.5667 | 0.5774 |
6 | 0.8525 | 0.8888 | 0.5607 | 0.7649 | 0.6471 | 0.5566 | 0.5674 |
7 | 0.8397 | 0.8760 | 0.4709 | 0.7760 | 0.5861 | 0.4940 | 0.5177 |
8 | 0.8537 | 0.8979 | 0.5898 | 0.7500 | 0.6603 | 0.5688 | 0.5755 |
9 | 0.8578 | 0.8910 | 0.5825 | 0.7717 | 0.6639 | 0.5760 | 0.5851 |
Mean | 0.8533 | 0.8932 | 0.5625 | 0.7682 | 0.6486 | 0.5588 | 0.5702 |
SD | 0.0059 | 0.0078 | 0.0347 | 0.0170 | 0.0231 | 0.0245 | 0.0208 |
# Show the tuned Random Forest's chosen hyperparameters
print(tuned_rf)
RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=9, max_features=1.0, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0005, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=130, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
# Diagnostic plots for the tuned Random Forest
plot_model(tuned_rf, plot = 'auc')
plot_model(tuned_rf, plot = 'pr')
plot_model(tuned_rf, plot = 'feature')
plot_model(tuned_rf, plot = 'confusion_matrix')
plot_model(tuned_rf, plot = 'learning')
plot_model(tuned_rf, plot = 'threshold')
# Score the tuned CatBoost on PyCaret's internal hold-out (test) set
predict_model(tuned_catboost);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | CatBoost Classifier | 0.8667 | 0.9254 | 0.6414 | 0.7797 | 0.7038 | 0.6189 | 0.6238 |
# Score the tuned Random Forest on the internal hold-out set
predict_model(tuned_rf);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Random Forest Classifier | 0.8478 | 0.8948 | 0.5807 | 0.7470 | 0.6534 | 0.5579 | 0.5651 |
# Refit the tuned CatBoost on the full modeling data (train + test)
final_catboost = finalize_model(tuned_catboost)
# Final model parameters for deployment
print(final_catboost)
<catboost.core.CatBoostClassifier object at 0x7f91a1d93e10>
# NOTE(review): after finalize_model the hold-out rows were part of
# training, so this score is optimistically biased — rely on the
# unseen-data evaluation below instead.
predict_model(final_catboost);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | CatBoost Classifier | 0.8766 | 0.9364 | 0.6569 | 0.8077 | 0.7246 | 0.6461 | 0.6519 |
# Score the truly unseen 25% split; the result adds a 'Label' column
# (predicted class) and a 'Score' column (probability of that class).
unseen_predictions = predict_model(final_catboost, data=data_unseen)
unseen_predictions.head()
age | workclass | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income >50K | Label | Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53 | Private | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | 0 | 0 | 0.8994 |
1 | 37 | Private | Some-college | 10 | Married-civ-spouse | Exec-managerial | Husband | Black | Male | 0 | 0 | 80 | United-States | 1 | 1 | 0.7678 |
2 | 34 | Private | 7th-8th | 4 | Married-civ-spouse | Transport-moving | Husband | Amer-Indian-Eskimo | Male | 0 | 0 | 45 | Mexico | 0 | 0 | 0.9691 |
3 | 32 | Private | HS-grad | 9 | Never-married | Machine-op-inspct | Unmarried | White | Male | 0 | 0 | 40 | United-States | 0 | 0 | 0.9871 |
4 | 40 | Private | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 60 | United-States | 1 | 1 | 0.8254 |
from pycaret.utils import check_metric
# Compare ground-truth target against predicted Label on the unseen split
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'Accuracy')
0.8779
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'Recall')
0.6614
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'Precision')
0.7831
# NOTE(review): AUC computed from hard labels rather than the 'Score'
# probabilities — this understates the model's true AUC.
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'AUC')
0.8027
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'F1')
0.7171
# Persist the entire preprocessing pipeline + model to Final_Model.pkl
save_model(final_catboost,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='income >50K', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_s... ('binn', 'passthrough'), ('rem_outliers', 'passthrough'), ('cluster_all', 'passthrough'), ('dummy', Dummify(target='income >50K')), ('fix_perfect', Remove_100(target='income >50K')), ('clean_names', Clean_Colum_Names()), ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'), ('dfs', 'passthrough'), ('pca', 'passthrough'), ['trained_model', <catboost.core.CatBoostClassifier object at 0x7f91a1d93e10>]], verbose=False), 'Final_Model.pkl')
# Reload the saved pipeline + model from disk
load_saved_model = load_model('Final_Model')
Transformation Pipeline and Model Successfully Loaded
# Verify the reloaded model reproduces predictions on the unseen data
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head(10)
Label | Score | |
---|---|---|
0 | 0 | 0.8994 |
1 | 1 | 0.7678 |
2 | 0 | 0.9691 |
3 | 0 | 0.9871 |
4 | 1 | 0.8254 |
5 | 0 | 0.9477 |
6 | 0 | 0.7862 |
7 | 0 | 0.9244 |
8 | 0 | 0.9631 |
9 | 0 | 0.9991 |
# ---- Second experiment: multi-model / AutoML workflow on a fresh split ----
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# provide the dataset name as shown in pycaret
whichDataset = 'income'
from pycaret.datasets import get_data
dataset = get_data(whichDataset)
# NOTE(review): this run uses random_state=421 (vs. 1234 in the first
# run), so the modeling/unseen split differs from the earlier one.
data = dataset.sample(frac=0.75, random_state=421)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
age | workclass | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income >50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | 0 |
1 | 50 | Self-emp-not-inc | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | 0 |
2 | 38 | Private | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | 0 |
3 | 53 | Private | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | 0 |
4 | 28 | Private | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | 0 |
Data for Modeling: (24421, 14) Unseen Data For Predictions: (8140, 14)
# Re-initialize the PyCaret experiment on the new split
clf = setup(data = data, target = 'income >50K', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | income >50K |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (24421, 14) |
5 | Missing Values | True |
6 | Numeric Features | 4 |
7 | Categorical Features | 9 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (17094, 103) |
12 | Transformed Test Set | (7327, 103) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 90fd |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
# NOTE(review): redundant re-suppression of warnings (already done above)
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# compare all baseline models and select top 5
# n_select=5 makes compare_models return a list of the 5 best estimators
# instead of a single model (ranked by Accuracy by default)
top_models = compare_models(n_select = 5)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
catboost | CatBoost Classifier | 0.8712 | 0.9260 | 0.6455 | 0.7836 | 0.7077 | 0.6262 | 0.6311 | 6.2060 |
xgboost | Extreme Gradient Boosting | 0.8688 | 0.9236 | 0.6523 | 0.7699 | 0.7060 | 0.6223 | 0.6260 | 2.2300 |
lightgbm | Light Gradient Boosting Machine | 0.8686 | 0.9242 | 0.6494 | 0.7710 | 0.7048 | 0.6211 | 0.6250 | 0.1170 |
gbc | Gradient Boosting Classifier | 0.8618 | 0.9176 | 0.5751 | 0.7967 | 0.6677 | 0.5834 | 0.5959 | 1.0560 |
ada | Ada Boost Classifier | 0.8585 | 0.9119 | 0.6157 | 0.7540 | 0.6777 | 0.5883 | 0.5933 | 0.3240 |
lr | Logistic Regression | 0.8514 | 0.9054 | 0.5995 | 0.7368 | 0.6609 | 0.5671 | 0.5722 | 2.0010 |
rf | Random Forest Classifier | 0.8440 | 0.8894 | 0.6121 | 0.7038 | 0.6545 | 0.5544 | 0.5568 | 0.9190 |
lda | Linear Discriminant Analysis | 0.8418 | 0.8922 | 0.5615 | 0.7221 | 0.6315 | 0.5328 | 0.5398 | 0.1260 |
ridge | Ridge Classifier | 0.8404 | 0.0000 | 0.5063 | 0.7529 | 0.6050 | 0.5099 | 0.5259 | 0.0310 |
knn | K Neighbors Classifier | 0.8392 | 0.8631 | 0.6143 | 0.6878 | 0.6487 | 0.5449 | 0.5466 | 0.9560 |
et | Extra Trees Classifier | 0.8263 | 0.8468 | 0.5908 | 0.6563 | 0.6216 | 0.5093 | 0.5107 | 1.1180 |
dt | Decision Tree Classifier | 0.8144 | 0.7571 | 0.6077 | 0.6182 | 0.6128 | 0.4908 | 0.4909 | 0.0730 |
nb | Naive Bayes | 0.8017 | 0.8935 | 0.8090 | 0.5626 | 0.6635 | 0.5293 | 0.5470 | 0.0280 |
svm | SVM - Linear Kernel | 0.7611 | 0.0000 | 0.6298 | 0.5487 | 0.5426 | 0.3946 | 0.4273 | 0.0990 |
dummy | Dummy Classifier | 0.7584 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0180 |
qda | Quadratic Discriminant Analysis | 0.2461 | 0.5024 | 0.9983 | 0.2425 | 0.3902 | 0.0023 | 0.0285 | 0.0660 |
# Display the five selected base estimators
top_models
[<catboost.core.CatBoostClassifier at 0x7f0c551cc350>, XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='auto', validate_parameters=1, verbosity=0), LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=1234, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0, n_estimators=50, random_state=1234)]
# tune top base models
# Tune each of the five selected estimators in turn (10-fold CV each)
tuned_top_models = [tune_model(i) for i in top_models]
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8515 | 0.9054 | 0.5496 | 0.7695 | 0.6412 | 0.5508 | 0.5632 |
1 | 0.8661 | 0.9158 | 0.5860 | 0.8067 | 0.6788 | 0.5969 | 0.6091 |
2 | 0.8561 | 0.9073 | 0.5714 | 0.7738 | 0.6574 | 0.5689 | 0.5794 |
3 | 0.8591 | 0.9164 | 0.5521 | 0.8028 | 0.6542 | 0.5695 | 0.5853 |
4 | 0.8555 | 0.9106 | 0.5787 | 0.7660 | 0.6593 | 0.5698 | 0.5789 |
5 | 0.8625 | 0.9148 | 0.5956 | 0.7834 | 0.6768 | 0.5915 | 0.6004 |
6 | 0.8555 | 0.9074 | 0.5787 | 0.7660 | 0.6593 | 0.5698 | 0.5789 |
7 | 0.8625 | 0.9150 | 0.6005 | 0.7799 | 0.6785 | 0.5929 | 0.6011 |
8 | 0.8502 | 0.9096 | 0.5593 | 0.7574 | 0.6435 | 0.5513 | 0.5615 |
9 | 0.8555 | 0.9140 | 0.5714 | 0.7712 | 0.6565 | 0.5675 | 0.5777 |
Mean | 0.8574 | 0.9116 | 0.5743 | 0.7777 | 0.6605 | 0.5729 | 0.5835 |
SD | 0.0048 | 0.0038 | 0.0163 | 0.0152 | 0.0129 | 0.0153 | 0.0150 |
# Display the tuned estimators
tuned_top_models
[<catboost.core.CatBoostClassifier at 0x7f0c55198d90>, XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0), LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=61, min_child_weight=0.001, min_split_gain=0.3, n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.4, loss='deviance', max_depth=2, max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.2, min_impurity_split=None, min_samples_leaf=1, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=200, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=0.7, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.5, n_estimators=60, random_state=1234)]
# ensemble top tuned models
# Wrap each tuned model in a BaggingClassifier ensemble
bagged_top_models = [ensemble_model(i) for i in tuned_top_models]
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8538 | 0.9048 | 0.5545 | 0.7763 | 0.6469 | 0.5579 | 0.5705 |
1 | 0.8643 | 0.9146 | 0.5763 | 0.8068 | 0.6723 | 0.5897 | 0.6030 |
2 | 0.8491 | 0.9074 | 0.5593 | 0.7524 | 0.6417 | 0.5487 | 0.5584 |
3 | 0.8602 | 0.9183 | 0.5521 | 0.8085 | 0.6561 | 0.5723 | 0.5887 |
4 | 0.8566 | 0.9128 | 0.5738 | 0.7745 | 0.6592 | 0.5710 | 0.5813 |
5 | 0.8631 | 0.9156 | 0.5908 | 0.7896 | 0.6759 | 0.5914 | 0.6014 |
6 | 0.8537 | 0.9067 | 0.5690 | 0.7655 | 0.6528 | 0.5626 | 0.5726 |
7 | 0.8590 | 0.9148 | 0.5835 | 0.7774 | 0.6667 | 0.5795 | 0.5891 |
8 | 0.8479 | 0.9090 | 0.5569 | 0.7492 | 0.6389 | 0.5452 | 0.5548 |
9 | 0.8561 | 0.9129 | 0.5714 | 0.7738 | 0.6574 | 0.5689 | 0.5793 |
Mean | 0.8564 | 0.9117 | 0.5688 | 0.7774 | 0.6568 | 0.5687 | 0.5799 |
SD | 0.0052 | 0.0042 | 0.0123 | 0.0189 | 0.0117 | 0.0148 | 0.0155 |
# Display the bagged ensembles
bagged_top_models
[BaggingClassifier(base_estimator=<catboost.core.CatBoostClassifier object at 0x7f0c549255d0>, bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=61, min_child_weight=0.001, min_split_gain=0.3, n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.4, loss='deviance', max_depth=2, max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.2, min_impurity_split=None, min_samples_leaf=1, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=200, 
n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=0.7, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.5, n_estimators=60, random_state=1234), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False)]
# select best model based on AUC
# automl() scans every model trained in this session (base, tuned, and
# bagged) and returns the one with the best cross-validated score for
# the requested metric.
best1 = automl(optimize = 'AUC')
best2 = automl(optimize = 'Accuracy')
best3 = automl(optimize = 'Recall')
best4 = automl(optimize = 'Precision')
best5 = automl(optimize = 'F1')
print(); print("Best model based on AUC: "); print(best1)
print(); print("Best model based on Accuracy: "); print(best2)
print(); print("Best model based on Recall: "); print(best3)
print(); print("Best model based on Precision: "); print(best4)
print(); print("Best model based on F1: "); print(best5)
Best model based on AUC: <catboost.core.CatBoostClassifier object at 0x7f0c55218850> Best model based on Accuracy: BaggingClassifier(base_estimator=LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=61, min_child_weight=0.001, min_split_gain=0.3, n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False) Best model based on Recall: QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0, store_covariance=False, tol=0.0001) Best model based on Precision: GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False) Best model based on F1: BaggingClassifier(base_estimator=LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=61, min_child_weight=0.001, min_split_gain=0.3, n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, 
oob_score=False, random_state=1234, verbose=0, warm_start=False)
# Diagnostics for the best-by-Accuracy model, then persist it
plot_model(best2, plot = 'auc')
plot_model(best2, plot = 'confusion_matrix')
plot_model(best2, plot = 'learning')
# NOTE(review): this overwrites the 'Final_Model.pkl' saved earlier in
# the first experiment.
save_model(best2,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='income >50K', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_s... n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False)]], verbose=False), 'Final_Model.pkl')
# Reload the saved pipeline + model and sanity-check predictions on the
# unseen split
load_saved_model = load_model('Final_Model')
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head()
Transformation Pipeline and Model Successfully Loaded
Label | Score | |
---|---|---|
0 | 0 | 0.9502 |
1 | 0 | 0.9991 |
2 | 0 | 0.8660 |
3 | 0 | 0.9659 |
4 | 0 | 0.9990 |
# In this coding recipe, we discussed how to build a machine learning model in Python using PyCaret.