# For more projects visit: https://setscholars.net
# Suppress warnings in Jupyter Notebooks
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use the fivethirtyeight style for every matplotlib figure below
plt.style.use('fivethirtyeight')
# PyCaret's classification module exposes setup/compare_models/tune_model/etc.
# at module level via the star import
from pycaret.classification import *
# provide the dataset name as shown in pycaret
whichDataset = 'income'
from pycaret.datasets import get_data
# Load the 'income' (UCI Adult census) dataset bundled with PyCaret;
# target column is 'income >50K' (binary 0/1)
dataset = get_data(whichDataset)
age | workclass | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income >50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | 0 |
1 | 50 | Self-emp-not-inc | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | 0 |
2 | 38 | Private | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | 0 |
3 | 53 | Private | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | 0 |
4 | 28 | Private | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | 0 |
# Inspect the raw dataset dimensions (rows, columns)
dataset.shape
(32561, 14)
# List all column names; the last one, 'income >50K', is the target
dataset.columns.to_list()
['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income >50K']
# Hold out 25% of rows as truly "unseen" data for a final sanity check;
# the remaining 75% goes to PyCaret for its own train/test split.
data = dataset.sample(frac=0.75, random_state=1234)
data_unseen = dataset.drop(data.index)
# Reset both frames to clean 0..n-1 integer indices
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
Data for Modeling: (24421, 14) Unseen Data For Predictions: (8140, 14)
# Initialize the PyCaret experiment: infers column types, imputes missing
# values, one-hot encodes categoricals, and creates the internal
# train/test split (session_id fixes all random seeds for reproducibility).
env_setup = setup(data = data, target = 'income >50K', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | income >50K |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (24421, 14) |
5 | Missing Values | True |
6 | Numeric Features | 4 |
7 | Categorical Features | 9 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (17094, 104) |
12 | Transformed Test Set | (7327, 104) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | a69e |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
# NOTE(review): warnings were already suppressed at the top of the file;
# re-importing/re-filtering here is harmless but redundant.
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# --------------------------------------
# Cross-validate every available classifier and return the single best
# model (ranked by Accuracy, the default sort metric).
best_model = compare_models()
# --------------------------------------
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
lightgbm | Light Gradient Boosting Machine | 0.8715 | 0.9253 | 0.6554 | 0.7776 | 0.7112 | 0.6293 | 0.6332 | 0.1170 |
catboost | CatBoost Classifier | 0.8697 | 0.9250 | 0.6398 | 0.7808 | 0.7033 | 0.6208 | 0.6259 | 6.1470 |
xgboost | Extreme Gradient Boosting | 0.8695 | 0.9237 | 0.6532 | 0.7714 | 0.7073 | 0.6240 | 0.6277 | 2.2300 |
gbc | Gradient Boosting Classifier | 0.8640 | 0.9173 | 0.5785 | 0.8035 | 0.6726 | 0.5897 | 0.6024 | 1.0690 |
ada | Ada Boost Classifier | 0.8598 | 0.9111 | 0.6146 | 0.7592 | 0.6791 | 0.5907 | 0.5962 | 0.3300 |
lr | Logistic Regression | 0.8501 | 0.9029 | 0.5914 | 0.7360 | 0.6557 | 0.5613 | 0.5669 | 1.9980 |
rf | Random Forest Classifier | 0.8470 | 0.8896 | 0.6193 | 0.7099 | 0.6613 | 0.5631 | 0.5654 | 0.9390 |
knn | K Neighbors Classifier | 0.8435 | 0.8677 | 0.6246 | 0.6957 | 0.6581 | 0.5571 | 0.5585 | 0.9510 |
lda | Linear Discriminant Analysis | 0.8418 | 0.8905 | 0.5601 | 0.7223 | 0.6308 | 0.5322 | 0.5392 | 0.1280 |
ridge | Ridge Classifier | 0.8411 | 0.0000 | 0.5090 | 0.7530 | 0.6073 | 0.5123 | 0.5278 | 0.0280 |
et | Extra Trees Classifier | 0.8240 | 0.8487 | 0.5824 | 0.6518 | 0.6150 | 0.5014 | 0.5028 | 1.1500 |
dt | Decision Tree Classifier | 0.8196 | 0.7684 | 0.6217 | 0.6276 | 0.6244 | 0.5058 | 0.5059 | 0.0710 |
nb | Naive Bayes | 0.8025 | 0.8925 | 0.8049 | 0.5637 | 0.6630 | 0.5294 | 0.5462 | 0.0270 |
svm | SVM - Linear Kernel | 0.7759 | 0.0000 | 0.4636 | 0.5735 | 0.4643 | 0.3387 | 0.3674 | 0.0840 |
dummy | Dummy Classifier | 0.7586 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0190 |
qda | Quadratic Discriminant Analysis | 0.2492 | 0.5045 | 0.9981 | 0.2430 | 0.3909 | 0.0043 | 0.0408 | 0.0640 |
# Train a CatBoost classifier with 10-fold cross-validation
catboost = create_model('catboost')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8801 | 0.9299 | 0.6586 | 0.8095 | 0.7263 | 0.6506 | 0.6563 |
1 | 0.8725 | 0.9321 | 0.6392 | 0.7928 | 0.7078 | 0.6274 | 0.6334 |
2 | 0.8678 | 0.9228 | 0.6271 | 0.7825 | 0.6962 | 0.6131 | 0.6192 |
3 | 0.8632 | 0.9287 | 0.6320 | 0.7609 | 0.6905 | 0.6036 | 0.6079 |
4 | 0.8777 | 0.9391 | 0.6610 | 0.7982 | 0.7232 | 0.6456 | 0.6503 |
5 | 0.8578 | 0.9159 | 0.6271 | 0.7443 | 0.6807 | 0.5901 | 0.5937 |
6 | 0.8742 | 0.9217 | 0.6505 | 0.7906 | 0.7137 | 0.6341 | 0.6390 |
7 | 0.8555 | 0.9106 | 0.6068 | 0.7463 | 0.6693 | 0.5781 | 0.5832 |
8 | 0.8719 | 0.9224 | 0.6505 | 0.7813 | 0.7099 | 0.6286 | 0.6329 |
9 | 0.8760 | 0.9270 | 0.6456 | 0.8012 | 0.7151 | 0.6369 | 0.6430 |
Mean | 0.8697 | 0.9250 | 0.6398 | 0.7808 | 0.7033 | 0.6208 | 0.6259 |
SD | 0.0080 | 0.0078 | 0.0159 | 0.0217 | 0.0176 | 0.0227 | 0.0231 |
# Hyperparameter-tune the CatBoost model (randomized search over
# PyCaret's default grid, optimizing Accuracy by default)
tuned_catboost = tune_model(catboost)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8813 | 0.9291 | 0.6441 | 0.8261 | 0.7238 | 0.6497 | 0.6578 |
1 | 0.8743 | 0.9321 | 0.6441 | 0.7964 | 0.7122 | 0.6329 | 0.6387 |
2 | 0.8667 | 0.9198 | 0.6368 | 0.7713 | 0.6976 | 0.6131 | 0.6177 |
3 | 0.8608 | 0.9253 | 0.6295 | 0.7536 | 0.6860 | 0.5975 | 0.6015 |
4 | 0.8812 | 0.9400 | 0.6707 | 0.8052 | 0.7318 | 0.6564 | 0.6609 |
5 | 0.8619 | 0.9180 | 0.6199 | 0.7642 | 0.6845 | 0.5973 | 0.6027 |
6 | 0.8695 | 0.9197 | 0.6432 | 0.7771 | 0.7039 | 0.6211 | 0.6257 |
7 | 0.8537 | 0.9115 | 0.6092 | 0.7382 | 0.6676 | 0.5749 | 0.5792 |
8 | 0.8713 | 0.9207 | 0.6311 | 0.7927 | 0.7027 | 0.6219 | 0.6285 |
9 | 0.8771 | 0.9246 | 0.6553 | 0.7988 | 0.7200 | 0.6423 | 0.6474 |
Mean | 0.8698 | 0.9241 | 0.6384 | 0.7824 | 0.7030 | 0.6207 | 0.6260 |
SD | 0.0087 | 0.0077 | 0.0166 | 0.0250 | 0.0189 | 0.0244 | 0.0250 |
# Show the tuned estimator's repr
print(tuned_catboost)
<catboost.core.CatBoostClassifier object at 0x7f91abedd0d0>
# Diagnostic plots for the tuned CatBoost model
plot_model(tuned_catboost, plot = 'auc')
plot_model(tuned_catboost, plot = 'pr')
plot_model(tuned_catboost, plot='feature')
plot_model(tuned_catboost, plot = 'confusion_matrix')
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
plot_model(tuned_catboost, plot = 'learning')
plot_model(tuned_catboost, plot = 'threshold')
# Train a Random Forest baseline for comparison
rf = create_model('rf')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8602 | 0.8860 | 0.6271 | 0.7529 | 0.6843 | 0.5955 | 0.5996 |
1 | 0.8386 | 0.8968 | 0.6005 | 0.6908 | 0.6425 | 0.5389 | 0.5411 |
2 | 0.8351 | 0.8874 | 0.6029 | 0.6785 | 0.6385 | 0.5321 | 0.5337 |
3 | 0.8322 | 0.8905 | 0.5763 | 0.6800 | 0.6239 | 0.5168 | 0.5197 |
4 | 0.8543 | 0.9019 | 0.6271 | 0.7316 | 0.6754 | 0.5821 | 0.5850 |
5 | 0.8467 | 0.8865 | 0.6392 | 0.7003 | 0.6684 | 0.5689 | 0.5699 |
6 | 0.8572 | 0.8910 | 0.6553 | 0.7258 | 0.6888 | 0.5965 | 0.5978 |
7 | 0.8356 | 0.8743 | 0.5850 | 0.6866 | 0.6317 | 0.5267 | 0.5296 |
8 | 0.8537 | 0.8900 | 0.6505 | 0.7166 | 0.6819 | 0.5872 | 0.5884 |
9 | 0.8561 | 0.8918 | 0.6286 | 0.7358 | 0.6780 | 0.5861 | 0.5891 |
Mean | 0.8470 | 0.8896 | 0.6193 | 0.7099 | 0.6613 | 0.5631 | 0.5654 |
SD | 0.0101 | 0.0069 | 0.0256 | 0.0248 | 0.0232 | 0.0295 | 0.0295 |
# Hyperparameter-tune the Random Forest model
tuned_rf = tune_model(rf)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8614 | 0.8986 | 0.5617 | 0.8056 | 0.6619 | 0.5782 | 0.5931 |
1 | 0.8591 | 0.9047 | 0.5981 | 0.7671 | 0.6721 | 0.5841 | 0.5914 |
2 | 0.8480 | 0.8887 | 0.5690 | 0.7413 | 0.6438 | 0.5493 | 0.5571 |
3 | 0.8515 | 0.8958 | 0.5375 | 0.7789 | 0.6361 | 0.5467 | 0.5615 |
4 | 0.8537 | 0.9013 | 0.5860 | 0.7539 | 0.6594 | 0.5681 | 0.5754 |
5 | 0.8555 | 0.8893 | 0.5690 | 0.7730 | 0.6555 | 0.5667 | 0.5774 |
6 | 0.8525 | 0.8888 | 0.5607 | 0.7649 | 0.6471 | 0.5566 | 0.5674 |
7 | 0.8397 | 0.8760 | 0.4709 | 0.7760 | 0.5861 | 0.4940 | 0.5177 |
8 | 0.8537 | 0.8979 | 0.5898 | 0.7500 | 0.6603 | 0.5688 | 0.5755 |
9 | 0.8578 | 0.8910 | 0.5825 | 0.7717 | 0.6639 | 0.5760 | 0.5851 |
Mean | 0.8533 | 0.8932 | 0.5625 | 0.7682 | 0.6486 | 0.5588 | 0.5702 |
SD | 0.0059 | 0.0078 | 0.0347 | 0.0170 | 0.0231 | 0.0245 | 0.0208 |
# Show the tuned Random Forest's chosen hyperparameters
print(tuned_rf)
RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=9, max_features=1.0, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0005, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=130, n_jobs=-1, oob_score=False, random_state=1234, verbose=0, warm_start=False)
# Diagnostic plots for the tuned Random Forest
plot_model(tuned_rf, plot = 'auc')
plot_model(tuned_rf, plot = 'pr')
plot_model(tuned_rf, plot = 'feature')
plot_model(tuned_rf, plot = 'confusion_matrix')
plot_model(tuned_rf, plot = 'learning')
plot_model(tuned_rf, plot = 'threshold')
# Score the tuned CatBoost on PyCaret's internal hold-out (test) set
predict_model(tuned_catboost);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | CatBoost Classifier | 0.8667 | 0.9254 | 0.6414 | 0.7797 | 0.7038 | 0.6189 | 0.6238 |
# Score the tuned Random Forest on the internal hold-out set
predict_model(tuned_rf);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Random Forest Classifier | 0.8478 | 0.8948 | 0.5807 | 0.7470 | 0.6534 | 0.5579 | 0.5651 |
# Refit the tuned CatBoost on the full modeling data (train + test)
final_catboost = finalize_model(tuned_catboost)
# Final model parameters for deployment
print(final_catboost)
<catboost.core.CatBoostClassifier object at 0x7f91a1d93e10>
# NOTE(review): after finalize_model the hold-out rows were part of
# training, so this score is optimistically biased — rely on the
# unseen-data evaluation below instead.
predict_model(final_catboost);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | CatBoost Classifier | 0.8766 | 0.9364 | 0.6569 | 0.8077 | 0.7246 | 0.6461 | 0.6519 |
# Score the truly unseen 25% split; the result adds a 'Label' column
# (predicted class) and a 'Score' column (probability of that class).
unseen_predictions = predict_model(final_catboost, data=data_unseen)
unseen_predictions.head()
age | workclass | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income >50K | Label | Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53 | Private | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | 0 | 0 | 0.8994 |
1 | 37 | Private | Some-college | 10 | Married-civ-spouse | Exec-managerial | Husband | Black | Male | 0 | 0 | 80 | United-States | 1 | 1 | 0.7678 |
2 | 34 | Private | 7th-8th | 4 | Married-civ-spouse | Transport-moving | Husband | Amer-Indian-Eskimo | Male | 0 | 0 | 45 | Mexico | 0 | 0 | 0.9691 |
3 | 32 | Private | HS-grad | 9 | Never-married | Machine-op-inspct | Unmarried | White | Male | 0 | 0 | 40 | United-States | 0 | 0 | 0.9871 |
4 | 40 | Private | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 60 | United-States | 1 | 1 | 0.8254 |
from pycaret.utils import check_metric
# Compare ground-truth target against predicted Label on the unseen split
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'Accuracy')
0.8779
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'Recall')
0.6614
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'Precision')
0.7831
# NOTE(review): AUC computed from hard labels rather than the 'Score'
# probabilities — this understates the model's true AUC.
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'AUC')
0.8027
check_metric(unseen_predictions['income >50K'], unseen_predictions['Label'], metric = 'F1')
0.7171
# Persist the entire preprocessing pipeline + model to Final_Model.pkl
save_model(final_catboost,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='income >50K', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_s... ('binn', 'passthrough'), ('rem_outliers', 'passthrough'), ('cluster_all', 'passthrough'), ('dummy', Dummify(target='income >50K')), ('fix_perfect', Remove_100(target='income >50K')), ('clean_names', Clean_Colum_Names()), ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'), ('dfs', 'passthrough'), ('pca', 'passthrough'), ['trained_model', <catboost.core.CatBoostClassifier object at 0x7f91a1d93e10>]], verbose=False), 'Final_Model.pkl')
# Reload the saved pipeline + model from disk
load_saved_model = load_model('Final_Model')
Transformation Pipeline and Model Successfully Loaded
# Verify the reloaded model reproduces predictions on the unseen data
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head(10)
Label | Score | |
---|---|---|
0 | 0 | 0.8994 |
1 | 1 | 0.7678 |
2 | 0 | 0.9691 |
3 | 0 | 0.9871 |
4 | 1 | 0.8254 |
5 | 0 | 0.9477 |
6 | 0 | 0.7862 |
7 | 0 | 0.9244 |
8 | 0 | 0.9631 |
9 | 0 | 0.9991 |
# ---- Second experiment: multi-model / AutoML workflow on a fresh split ----
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# provide the dataset name as shown in pycaret
whichDataset = 'income'
from pycaret.datasets import get_data
dataset = get_data(whichDataset)
# NOTE(review): this run uses random_state=421 (vs. 1234 in the first
# run), so the modeling/unseen split differs from the earlier one.
data = dataset.sample(frac=0.75, random_state=421)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
age | workclass | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income >50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | 0 |
1 | 50 | Self-emp-not-inc | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | 0 |
2 | 38 | Private | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | 0 |
3 | 53 | Private | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | 0 |
4 | 28 | Private | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | 0 |
Data for Modeling: (24421, 14) Unseen Data For Predictions: (8140, 14)
# Re-initialize the PyCaret experiment on the new split
clf = setup(data = data, target = 'income >50K', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | income >50K |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (24421, 14) |
5 | Missing Values | True |
6 | Numeric Features | 4 |
7 | Categorical Features | 9 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (17094, 103) |
12 | Transformed Test Set | (7327, 103) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 90fd |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
# NOTE(review): redundant re-suppression of warnings (already done above)
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# compare all baseline models and select top 5
# n_select=5 makes compare_models return a list of the 5 best estimators
# instead of a single model (ranked by Accuracy by default)
top_models = compare_models(n_select = 5)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
catboost | CatBoost Classifier | 0.8712 | 0.9260 | 0.6455 | 0.7836 | 0.7077 | 0.6262 | 0.6311 | 6.2060 |
xgboost | Extreme Gradient Boosting | 0.8688 | 0.9236 | 0.6523 | 0.7699 | 0.7060 | 0.6223 | 0.6260 | 2.2300 |
lightgbm | Light Gradient Boosting Machine | 0.8686 | 0.9242 | 0.6494 | 0.7710 | 0.7048 | 0.6211 | 0.6250 | 0.1170 |
gbc | Gradient Boosting Classifier | 0.8618 | 0.9176 | 0.5751 | 0.7967 | 0.6677 | 0.5834 | 0.5959 | 1.0560 |
ada | Ada Boost Classifier | 0.8585 | 0.9119 | 0.6157 | 0.7540 | 0.6777 | 0.5883 | 0.5933 | 0.3240 |
lr | Logistic Regression | 0.8514 | 0.9054 | 0.5995 | 0.7368 | 0.6609 | 0.5671 | 0.5722 | 2.0010 |
rf | Random Forest Classifier | 0.8440 | 0.8894 | 0.6121 | 0.7038 | 0.6545 | 0.5544 | 0.5568 | 0.9190 |
lda | Linear Discriminant Analysis | 0.8418 | 0.8922 | 0.5615 | 0.7221 | 0.6315 | 0.5328 | 0.5398 | 0.1260 |
ridge | Ridge Classifier | 0.8404 | 0.0000 | 0.5063 | 0.7529 | 0.6050 | 0.5099 | 0.5259 | 0.0310 |
knn | K Neighbors Classifier | 0.8392 | 0.8631 | 0.6143 | 0.6878 | 0.6487 | 0.5449 | 0.5466 | 0.9560 |
et | Extra Trees Classifier | 0.8263 | 0.8468 | 0.5908 | 0.6563 | 0.6216 | 0.5093 | 0.5107 | 1.1180 |
dt | Decision Tree Classifier | 0.8144 | 0.7571 | 0.6077 | 0.6182 | 0.6128 | 0.4908 | 0.4909 | 0.0730 |
nb | Naive Bayes | 0.8017 | 0.8935 | 0.8090 | 0.5626 | 0.6635 | 0.5293 | 0.5470 | 0.0280 |
svm | SVM - Linear Kernel | 0.7611 | 0.0000 | 0.6298 | 0.5487 | 0.5426 | 0.3946 | 0.4273 | 0.0990 |
dummy | Dummy Classifier | 0.7584 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0180 |
qda | Quadratic Discriminant Analysis | 0.2461 | 0.5024 | 0.9983 | 0.2425 | 0.3902 | 0.0023 | 0.0285 | 0.0660 |
# Display the five selected base estimators
top_models
[<catboost.core.CatBoostClassifier at 0x7f0c551cc350>, XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='auto', validate_parameters=1, verbosity=0), LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=1234, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0, n_estimators=50, random_state=1234)]
# tune top base models
# Tune each of the five selected estimators in turn (10-fold CV each)
tuned_top_models = [tune_model(i) for i in top_models]
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8515 | 0.9054 | 0.5496 | 0.7695 | 0.6412 | 0.5508 | 0.5632 |
1 | 0.8661 | 0.9158 | 0.5860 | 0.8067 | 0.6788 | 0.5969 | 0.6091 |
2 | 0.8561 | 0.9073 | 0.5714 | 0.7738 | 0.6574 | 0.5689 | 0.5794 |
3 | 0.8591 | 0.9164 | 0.5521 | 0.8028 | 0.6542 | 0.5695 | 0.5853 |
4 | 0.8555 | 0.9106 | 0.5787 | 0.7660 | 0.6593 | 0.5698 | 0.5789 |
5 | 0.8625 | 0.9148 | 0.5956 | 0.7834 | 0.6768 | 0.5915 | 0.6004 |
6 | 0.8555 | 0.9074 | 0.5787 | 0.7660 | 0.6593 | 0.5698 | 0.5789 |
7 | 0.8625 | 0.9150 | 0.6005 | 0.7799 | 0.6785 | 0.5929 | 0.6011 |
8 | 0.8502 | 0.9096 | 0.5593 | 0.7574 | 0.6435 | 0.5513 | 0.5615 |
9 | 0.8555 | 0.9140 | 0.5714 | 0.7712 | 0.6565 | 0.5675 | 0.5777 |
Mean | 0.8574 | 0.9116 | 0.5743 | 0.7777 | 0.6605 | 0.5729 | 0.5835 |
SD | 0.0048 | 0.0038 | 0.0163 | 0.0152 | 0.0129 | 0.0153 | 0.0150 |
# Display the tuned estimators
tuned_top_models
[<catboost.core.CatBoostClassifier at 0x7f0c55198d90>, XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0), LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=61, min_child_weight=0.001, min_split_gain=0.3, n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.4, loss='deviance', max_depth=2, max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.2, min_impurity_split=None, min_samples_leaf=1, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=200, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=0.7, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.5, n_estimators=60, random_state=1234)]
# ensemble top tuned models
# Wrap each tuned model in a BaggingClassifier ensemble
bagged_top_models = [ensemble_model(i) for i in tuned_top_models]
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.8538 | 0.9048 | 0.5545 | 0.7763 | 0.6469 | 0.5579 | 0.5705 |
1 | 0.8643 | 0.9146 | 0.5763 | 0.8068 | 0.6723 | 0.5897 | 0.6030 |
2 | 0.8491 | 0.9074 | 0.5593 | 0.7524 | 0.6417 | 0.5487 | 0.5584 |
3 | 0.8602 | 0.9183 | 0.5521 | 0.8085 | 0.6561 | 0.5723 | 0.5887 |
4 | 0.8566 | 0.9128 | 0.5738 | 0.7745 | 0.6592 | 0.5710 | 0.5813 |
5 | 0.8631 | 0.9156 | 0.5908 | 0.7896 | 0.6759 | 0.5914 | 0.6014 |
6 | 0.8537 | 0.9067 | 0.5690 | 0.7655 | 0.6528 | 0.5626 | 0.5726 |
7 | 0.8590 | 0.9148 | 0.5835 | 0.7774 | 0.6667 | 0.5795 | 0.5891 |
8 | 0.8479 | 0.9090 | 0.5569 | 0.7492 | 0.6389 | 0.5452 | 0.5548 |
9 | 0.8561 | 0.9129 | 0.5714 | 0.7738 | 0.6574 | 0.5689 | 0.5793 |
Mean | 0.8564 | 0.9117 | 0.5688 | 0.7774 | 0.6568 | 0.5687 | 0.5799 |
SD | 0.0052 | 0.0042 | 0.0123 | 0.0189 | 0.0117 | 0.0148 | 0.0155 |
# Display the bagged ensembles
bagged_top_models
[BaggingClassifier(base_estimator=<catboost.core.CatBoostClassifier object at 0x7f0c549255d0>, bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.2, max_delta_step=0, max_depth=11, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=280, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=1234, reg_alpha=1e-06, reg_lambda=0.05, scale_pos_weight=46.6, subsample=0.7, tree_method='auto', validate_parameters=1, verbosity=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=61, min_child_weight=0.001, min_split_gain=0.3, n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.4, loss='deviance', max_depth=2, max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.2, min_impurity_split=None, min_samples_leaf=1, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=200, 
n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=0.7, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False), BaggingClassifier(base_estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.5, n_estimators=60, random_state=1234), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False)]
# select best model based on AUC
# automl() scans every model trained in this session (base, tuned, and
# bagged) and returns the one with the best cross-validated score for
# the requested metric.
best1 = automl(optimize = 'AUC')
best2 = automl(optimize = 'Accuracy')
best3 = automl(optimize = 'Recall')
best4 = automl(optimize = 'Precision')
best5 = automl(optimize = 'F1')
print(); print("Best model based on AUC: "); print(best1)
print(); print("Best model based on Accuracy: "); print(best2)
print(); print("Best model based on Recall: "); print(best3)
print(); print("Best model based on Precision: "); print(best4)
print(); print("Best model based on F1: "); print(best5)
Best model based on AUC: <catboost.core.CatBoostClassifier object at 0x7f0c55218850> Best model based on Accuracy: BaggingClassifier(base_estimator=LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=61, min_child_weight=0.001, min_split_gain=0.3, n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False) Best model based on Recall: QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0, store_covariance=False, tol=0.0001) Best model based on Precision: GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, presort='deprecated', random_state=1234, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False) Best model based on F1: BaggingClassifier(base_estimator=LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=1.0, importance_type='split', learning_rate=0.3, max_depth=-1, min_child_samples=61, min_child_weight=0.001, min_split_gain=0.3, n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, 
oob_score=False, random_state=1234, verbose=0, warm_start=False)
# Diagnostics for the best-by-Accuracy model, then persist it
plot_model(best2, plot = 'auc')
plot_model(best2, plot = 'confusion_matrix')
plot_model(best2, plot = 'learning')
# NOTE(review): this overwrites the 'Final_Model.pkl' saved earlier in
# the first experiment.
save_model(best2,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='income >50K', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_s... n_estimators=190, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1234, verbose=0, warm_start=False)]], verbose=False), 'Final_Model.pkl')
# Reload the saved pipeline + model and sanity-check predictions on the
# unseen split
load_saved_model = load_model('Final_Model')
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction[["Label", "Score"]].head()
Transformation Pipeline and Model Successfully Loaded
Label | Score | |
---|---|---|
0 | 0 | 0.9502 |
1 | 0 | 0.9991 |
2 | 0 | 0.8660 |
3 | 0 | 0.9659 |
4 | 0 | 0.9990 |
# In this coding recipe, we discussed how to build a machine learning model in Python using PyCaret.