For more projects visit: https://setscholars.net
# Suppress warnings in Jupyter Notebooks
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# OpenML Dataset ID
whichDataset = 13 # provide dataset id
import openml
dataset = openml.datasets.get_dataset(whichDataset)
# Print a summary
print(
f"This is dataset '{dataset.name}', the target feature is "
f"'{dataset.default_target_attribute}'"
)
print(f"URL: {dataset.url}")
print(dataset.description)
This is dataset 'breast-cancer', the target feature is 'Class'
URL: https://www.openml.org/data/v1/download/13/breast-cancer.arff

**Author**:
**Source**: Unknown
**Please cite**: Citation Request: This breast cancer domain was obtained from the University Medical Centre, Institute of Oncology, Ljubljana, Yugoslavia. Thanks go to M. Zwitter and M. Soklic for providing the data. Please include this citation if you plan to use this database.

1. Title: Breast cancer data (Michalski has used this)
2. Sources:
   -- Matjaz Zwitter & Milan Soklic (physicians), Institute of Oncology, University Medical Center, Ljubljana, Yugoslavia
   -- Donors: Ming Tan and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
   -- Date: 11 July 1988
3. Past Usage: (Several: here are some)
   -- Michalski, R.S., Mozetic, I., Hong, J., & Lavrac, N. (1986). The Multi-Purpose Incremental Learning System AQ15 and its Testing Application to Three Medical Domains. In Proceedings of the Fifth National Conference on Artificial Intelligence, 1041-1045, Philadelphia, PA: Morgan Kaufmann. -- accuracy range: 66%-72%
   -- Clark, P. & Niblett, T. (1987). Induction in Noisy Domains. In Progress in Machine Learning (from the Proceedings of the 2nd European Working Session on Learning), 11-30, Bled, Yugoslavia: Sigma Press. -- 8 test results given: 65%-72% accuracy range
   -- Tan, M., & Eshelman, L. (1988). Using weighted networks to represent classification knowledge in noisy domains. Proceedings of the Fifth International Conference on Machine Learning, 121-134, Ann Arbor, MI. -- 4 systems tested: accuracy range was 68%-73.5%
   -- Cestnik, G., Kononenko, I., & Bratko, I. (1987). Assistant-86: A Knowledge-Elicitation Tool for Sophisticated Users. In I. Bratko & N. Lavrac (Eds.) Progress in Machine Learning, 31-45, Sigma Press. -- Assistant-86: 78% accuracy
4. Relevant Information: This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature. (See also lymphography and primary-tumor.) This data set includes 201 instances of one class and 85 instances of another class. The instances are described by 9 attributes, some of which are linear and some are nominal.
5. Number of Instances: 286
6. Number of Attributes: 9 + the class attribute
7. Attribute Information:
   1. Class: no-recurrence-events, recurrence-events
   2. age: 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99.
   3. menopause: lt40, ge40, premeno.
   4. tumor-size: 0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44, 45-49, 50-54, 55-59.
   5. inv-nodes: 0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26, 27-29, 30-32, 33-35, 36-39.
   6. node-caps: yes, no.
   7. deg-malig: 1, 2, 3.
   8. breast: left, right.
   9. breast-quad: left-up, left-low, right-up, right-low, central.
   10. irradiat: yes, no.
8. Missing Attribute Values: (denoted by "?") Attribute #6: 8 instances; Attribute #9: 1 instance.
9. Class Distribution:
   1. no-recurrence-events: 201 instances
   2. recurrence-events: 85 instances

Num Instances: 286
Num Attributes: 10
Num Continuous: 0 (Int 0 / Real 0)
Num Discrete: 10
Missing values: 9 / 0.3%

# | name | type | enum | ints | real | missing | distinct | (1)
---|---|---|---|---|---|---|---|---
1 | 'age' | Enum | 100% | 0% | 0% | 0 / 0% | 6 / 2% | 0%
2 | 'menopause' | Enum | 100% | 0% | 0% | 0 / 0% | 3 / 1% | 0%
3 | 'tumor-size' | Enum | 100% | 0% | 0% | 0 / 0% | 11 / 4% | 0%
4 | 'inv-nodes' | Enum | 100% | 0% | 0% | 0 / 0% | 7 / 2% | 0%
5 | 'node-caps' | Enum | 97% | 0% | 0% | 8 / 3% | 2 / 1% | 0%
6 | 'deg-malig' | Enum | 100% | 0% | 0% | 0 / 0% | 3 / 1% | 0%
7 | 'breast' | Enum | 100% | 0% | 0% | 0 / 0% | 2 / 1% | 0%
8 | 'breast-quad' | Enum | 100% | 0% | 0% | 1 / 0% | 5 / 2% | 0%
9 | 'irradiat' | Enum | 100% | 0% | 0% | 0 / 0% | 2 / 1% | 0%
10 | 'Class' | Enum | 100% | 0% | 0% | 0 / 0% | 2 / 1% | 0%
import warnings
warnings.filterwarnings("ignore")
# fetch the data as arrays, then rebuild a DataFrame
# (note: this rebinds `dataset` from the OpenML object to a pandas DataFrame)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute)
dataset = pd.DataFrame(X, columns=attribute_names)
dataset["target"] = y
dataset.shape
(286, 10)
dataset.head()
age | menopause | tumor-size | inv-nodes | node-caps | deg-malig | breast | breast-quad | irradiat | target | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 3.0 | 2.0 | 3.0 | 0.0 | 0.0 | 2.0 | 1.0 | 0.0 | 1.0 | 1 |
1 | 4.0 | 1.0 | 3.0 | 0.0 | 1.0 | 0.0 | 1.0 | 4.0 | 1.0 | 0 |
2 | 4.0 | 1.0 | 7.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1 |
3 | 3.0 | 2.0 | 7.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0 |
4 | 3.0 | 2.0 | 6.0 | 1.0 | 0.0 | 1.0 | 0.0 | 2.0 | 1.0 | 1 |
#dataset.columns.to_list()
# find missing values in data frame
dataset.isnull().sum().sum()
9
# replace missing values with 0
dataset = dataset.fillna(0)
dataset.isnull().sum().sum()
0
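The fillna(0) above folds every missing entry into category code 0, which already means something in these encoded columns. As a hedged alternative sketch (not used in the rest of this recipe), per-column mode imputation keeps missing entries within the most frequent existing code:
# alternative sketch: impute each column with its most frequent value
# (this would be applied instead of fillna(0), before the cell above)
dataset_mode = dataset.copy()
dataset_mode = dataset_mode.fillna(dataset_mode.mode().iloc[0])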
# group by 'target'
dataset.groupby('target').count()
age | menopause | tumor-size | inv-nodes | node-caps | deg-malig | breast | breast-quad | irradiat | |
---|---|---|---|---|---|---|---|---|---|
target | |||||||||
0 | 201 | 201 | 201 | 201 | 201 | 201 | 201 | 201 | 201 |
1 | 85 | 85 | 85 | 85 | 85 | 85 | 85 | 85 | 85 |
# split the data into a modeling set and an unseen holdout set
data = dataset.sample(frac=0.80, random_state=1234)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
Data for Modeling: (229, 10)
Unseen Data For Predictions: (57, 10)
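Note that sample(frac=0.80) draws a plain random holdout; with 201 vs 85 cases, the class ratio can drift between the two partitions. A small sketch of a stratified alternative using scikit-learn (an option we add here; the recipe itself keeps the random split):
from sklearn.model_selection import train_test_split
# stratify on the target so both partitions keep the 201:85 class ratio
data_s, unseen_s = train_test_split(dataset, test_size=0.20,
                                    stratify=dataset['target'],
                                    random_state=1234)
print(data_s['target'].value_counts(normalize=True))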
env_setup = setup(data = data, target = 'target', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | target |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (229, 10) |
5 | Missing Values | False |
6 | Numeric Features | 6 |
7 | Categorical Features | 3 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (160, 9) |
12 | Transformed Test Set | (69, 9) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 20f1 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
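setup() inferred 6 numeric and 3 categorical features here, although the ARFF statistics above list every attribute as nominal (Enum). A hedged sketch of overriding that inference; running it would re-initialize the experiment, so it is left commented out:
# env_setup = setup(data=data, target='target', session_id=1234,
#                   categorical_features=[c for c in data.columns
#                                         if c != 'target'])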
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# --------------------------------------
best_model = compare_models(exclude = ['catboost', 'lda'], sort = 'Accuracy')
# --------------------------------------
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
lr | Logistic Regression | 0.7688 | 0.7633 | 0.4550 | 0.6367 | 0.5101 | 0.3866 | 0.4082 | 0.2490 |
et | Extra Trees Classifier | 0.7625 | 0.7724 | 0.5350 | 0.6633 | 0.5754 | 0.4164 | 0.4332 | 0.2450 |
lightgbm | Light Gradient Boosting Machine | 0.7625 | 0.7047 | 0.4700 | 0.6483 | 0.5231 | 0.3809 | 0.3992 | 0.0260 |
nb | Naive Bayes | 0.7500 | 0.7703 | 0.5150 | 0.6642 | 0.5329 | 0.3776 | 0.4094 | 0.0060 |
ridge | Ridge Classifier | 0.7500 | 0.0000 | 0.4350 | 0.5933 | 0.4765 | 0.3423 | 0.3637 | 0.0290 |
rf | Random Forest Classifier | 0.7500 | 0.7529 | 0.5150 | 0.6350 | 0.5506 | 0.3848 | 0.4005 | 0.2640 |
knn | K Neighbors Classifier | 0.7375 | 0.6466 | 0.3100 | 0.5917 | 0.3929 | 0.2681 | 0.2950 | 0.0670 |
xgboost | Extreme Gradient Boosting | 0.7375 | 0.6933 | 0.4900 | 0.6302 | 0.5305 | 0.3577 | 0.3758 | 0.0590 |
qda | Quadratic Discriminant Analysis | 0.7312 | 0.7117 | 0.5150 | 0.6125 | 0.5287 | 0.3502 | 0.3729 | 0.0160 |
gbc | Gradient Boosting Classifier | 0.7250 | 0.6776 | 0.4750 | 0.5655 | 0.5017 | 0.3196 | 0.3308 | 0.0260 |
dt | Decision Tree Classifier | 0.7000 | 0.6441 | 0.4950 | 0.5667 | 0.4945 | 0.2887 | 0.3120 | 0.0050 |
ada | Ada Boost Classifier | 0.6938 | 0.6833 | 0.4350 | 0.5994 | 0.4627 | 0.2636 | 0.2928 | 0.0360 |
dummy | Dummy Classifier | 0.6938 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0040 |
svm | SVM - Linear Kernel | 0.6875 | 0.0000 | 0.4350 | 0.3609 | 0.3551 | 0.2102 | 0.2363 | 0.0060 |
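compare_models() can also rank by a different metric, change the fold count, or return several models at once; a sketch of those options (commented out so the comparison above stands):
# top3_by_auc = compare_models(exclude=['catboost', 'lda'],
#                              sort='AUC', fold=5, n_select=3)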
model_1 = create_model('lightgbm')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.8125 | 0.8182 | 0.4000 | 1.0000 | 0.5714 | 0.4783 | 0.5606 |
1 | 0.8125 | 0.7455 | 0.6000 | 0.7500 | 0.6667 | 0.5385 | 0.5449 | |
2 | 0.8125 | 0.9455 | 0.8000 | 0.6667 | 0.7273 | 0.5862 | 0.5919 | |
3 | 0.6250 | 0.6000 | 0.2000 | 0.3333 | 0.2500 | 0.0204 | 0.0216 | |
4 | 0.7500 | 0.5818 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
5 | 0.7500 | 0.8182 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
6 | 0.6875 | 0.4364 | 0.2000 | 0.5000 | 0.2857 | 0.1304 | 0.1529 | |
7 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | |
8 | 0.6875 | 0.3727 | 0.2000 | 0.5000 | 0.2857 | 0.1304 | 0.1529 | |
9 | 0.6875 | 0.7292 | 0.5000 | 0.4000 | 0.4444 | 0.2308 | 0.2335 | |
Mean | 0.7625 | 0.7047 | 0.4700 | 0.6483 | 0.5231 | 0.3809 | 0.3992 | |
Std | 0.1000 | 0.1953 | 0.2532 | 0.2153 | 0.2206 | 0.2720 | 0.2718 | |
Train | nan | 0.8938 | 0.9421 | 0.7143 | 0.9211 | 0.8046 | 0.7332 | 0.7444 |
tuned_model_1 = tune_model(model_1)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.8750 | 0.8182 | 0.6000 | 1.0000 | 0.7500 | 0.6735 | 0.7125 |
1 | 0.7500 | 0.7818 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
2 | 0.8125 | 0.9091 | 0.8000 | 0.6667 | 0.7273 | 0.5862 | 0.5919 | |
3 | 0.7500 | 0.7636 | 0.6000 | 0.6000 | 0.6000 | 0.4182 | 0.4182 | |
4 | 0.6250 | 0.6273 | 0.0000 | 0.0000 | 0.0000 | -0.1163 | -0.1741 | |
5 | 0.7500 | 0.9091 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
6 | 0.6875 | 0.4727 | 0.2000 | 0.5000 | 0.2857 | 0.1304 | 0.1529 | |
7 | 0.8125 | 1.0000 | 0.4000 | 1.0000 | 0.5714 | 0.4783 | 0.5606 | |
8 | 0.6250 | 0.4909 | 0.0000 | 0.0000 | 0.0000 | -0.1163 | -0.1741 | |
9 | 0.8125 | 0.8333 | 0.7500 | 0.6000 | 0.6667 | 0.5385 | 0.5449 | |
Mean | 0.7500 | 0.7606 | 0.4150 | 0.5700 | 0.4601 | 0.3286 | 0.3367 | |
Std | 0.0791 | 0.1685 | 0.2684 | 0.3247 | 0.2621 | 0.2637 | 0.2939 | |
Train | nan | 0.7875 | 0.8149 | 0.4490 | 0.7586 | 0.5641 | 0.4356 | 0.4618 |
print(tuned_model_1)
LGBMClassifier(bagging_fraction=0.9, bagging_freq=4, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.4,
               importance_type='split', learning_rate=0.05, max_depth=-1,
               min_child_samples=21, min_child_weight=0.001, min_split_gain=0.3,
               n_estimators=170, n_jobs=-1, num_leaves=2, objective=None,
               random_state=1234, reg_alpha=0.01, reg_lambda=3, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
tuned_model_1 = tune_model(model_1, n_iter=100)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.8750 | 0.8182 | 0.6000 | 1.0000 | 0.7500 | 0.6735 | 0.7125 |
1 | 0.7500 | 0.7636 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
2 | 0.8125 | 0.9091 | 0.8000 | 0.6667 | 0.7273 | 0.5862 | 0.5919 | |
3 | 0.6875 | 0.7909 | 0.4000 | 0.5000 | 0.4444 | 0.2308 | 0.2335 | |
4 | 0.7500 | 0.6818 | 0.2000 | 1.0000 | 0.3333 | 0.2558 | 0.3830 | |
5 | 0.7500 | 0.8000 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
6 | 0.6875 | 0.4455 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | |
7 | 0.8750 | 1.0000 | 0.6000 | 1.0000 | 0.7500 | 0.6735 | 0.7125 | |
8 | 0.6875 | 0.3636 | 0.2000 | 0.5000 | 0.2857 | 0.1304 | 0.1529 | |
9 | 0.8750 | 0.8437 | 0.7500 | 0.7500 | 0.7500 | 0.6667 | 0.6667 | |
Mean | 0.7750 | 0.7416 | 0.4350 | 0.6750 | 0.5041 | 0.3911 | 0.4187 | |
Std | 0.0750 | 0.1875 | 0.2430 | 0.2898 | 0.2379 | 0.2329 | 0.2347 | |
Train | nan | 0.8000 | 0.8212 | 0.4694 | 0.7931 | 0.5897 | 0.4688 | 0.4970 |
print(tuned_model_1)
LGBMClassifier(bagging_fraction=0.9, bagging_freq=6, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.4,
               importance_type='split', learning_rate=0.5, max_depth=-1,
               min_child_samples=11, min_child_weight=0.001, min_split_gain=0.8,
               n_estimators=300, n_jobs=-1, num_leaves=20, objective=None,
               random_state=1234, reg_alpha=0.001, reg_lambda=5, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
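tune_model() defaults to a random search over PyCaret's built-in grid, optimizing Accuracy. A sketch of steering it toward AUC with a hypothetical custom grid (the parameter values below are illustrative, not the recipe's settings):
# tuned_auc = tune_model(model_1, n_iter=100, optimize='AUC',
#                        custom_grid={'num_leaves': [2, 10, 20, 31],
#                                     'learning_rate': [0.05, 0.1, 0.5]})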
plot_model(tuned_model_1, plot = 'auc')
plot_model(tuned_model_1, plot = 'pr')
plot_model(tuned_model_1, plot='feature')
plot_model(tuned_model_1, plot = 'confusion_matrix')
plot_model(tuned_model_1, plot = 'learning')
#plot_model(tuned_model_1, plot = 'threshold')
plot_model(tuned_model_1, plot = 'boundary')
plot_model(tuned_model_1, plot = 'error')
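In a live notebook, the individual plot_model() calls above can be replaced by a single interactive widget:
# evaluate_model(tuned_model_1)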
model_2 = create_model('lr')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.8125 | 0.8182 | 0.4000 | 1.0000 | 0.5714 | 0.4783 | 0.5606 |
1 | 0.7500 | 0.7636 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
2 | 0.8125 | 0.9273 | 0.8000 | 0.6667 | 0.7273 | 0.5862 | 0.5919 | |
3 | 0.7500 | 0.7455 | 0.6000 | 0.6000 | 0.6000 | 0.4182 | 0.4182 | |
4 | 0.6875 | 0.6545 | 0.2000 | 0.5000 | 0.2857 | 0.1304 | 0.1529 | |
5 | 0.7500 | 0.9091 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
6 | 0.6875 | 0.4000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | |
7 | 0.8750 | 0.9636 | 0.6000 | 1.0000 | 0.7500 | 0.6735 | 0.7125 | |
8 | 0.7500 | 0.6182 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
9 | 0.8125 | 0.8333 | 0.7500 | 0.6000 | 0.6667 | 0.5385 | 0.5449 | |
Mean | 0.7688 | 0.7633 | 0.4550 | 0.6367 | 0.5101 | 0.3866 | 0.4082 | |
Std | 0.0562 | 0.1617 | 0.2307 | 0.2635 | 0.2126 | 0.1935 | 0.2013 | |
Train | nan | 0.8000 | 0.7919 | 0.5102 | 0.7576 | 0.6098 | 0.4821 | 0.4991 |
tuned_model_2 = tune_model(model_2, n_iter=100)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.8125 | 0.8182 | 0.4000 | 1.0000 | 0.5714 | 0.4783 | 0.5606 |
1 | 0.7500 | 0.7636 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
2 | 0.8125 | 0.8909 | 0.8000 | 0.6667 | 0.7273 | 0.5862 | 0.5919 | |
3 | 0.7500 | 0.7455 | 0.6000 | 0.6000 | 0.6000 | 0.4182 | 0.4182 | |
4 | 0.6875 | 0.6545 | 0.2000 | 0.5000 | 0.2857 | 0.1304 | 0.1529 | |
5 | 0.7500 | 0.8727 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
6 | 0.6875 | 0.4000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | |
7 | 0.8750 | 0.9636 | 0.6000 | 1.0000 | 0.7500 | 0.6735 | 0.7125 | |
8 | 0.7500 | 0.6364 | 0.4000 | 0.6667 | 0.5000 | 0.3469 | 0.3671 | |
9 | 0.8125 | 0.8333 | 0.7500 | 0.6000 | 0.6667 | 0.5385 | 0.5449 | |
Mean | 0.7688 | 0.7579 | 0.4550 | 0.6367 | 0.5101 | 0.3866 | 0.4082 | |
Std | 0.0562 | 0.1537 | 0.2307 | 0.2635 | 0.2126 | 0.1935 | 0.2013 | |
Train | nan | 0.8062 | 0.7910 | 0.5102 | 0.7812 | 0.6173 | 0.4951 | 0.5153 |
print(tuned_model_2)
LogisticRegression(C=1.577, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1234, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
plot_model(tuned_model_2, plot = 'auc')
plot_model(tuned_model_2, plot = 'pr')
plot_model(tuned_model_2, plot = 'feature')
plot_model(tuned_model_2, plot = 'confusion_matrix')
plot_model(tuned_model_2, plot = 'learning')
#plot_model(tuned_model_2, plot = 'threshold')
plot_model(tuned_model_2, plot = 'boundary')
plot_model(tuned_model_2, plot = 'error')
predict_model(tuned_model_1);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Light Gradient Boosting Machine | 0.7681 | 0.7104 | 0.3529 | 0.5455 | 0.4286 | 0.2914 | 0.3023 |
predict_model(tuned_model_2);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Logistic Regression | 0.7101 | 0.6878 | 0.3529 | 0.4000 | 0.3750 | 0.1873 | 0.1879 |
final_model = finalize_model(tuned_model_1);
[LightGBM] [Warning] bagging_fraction is set=0.9, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9
[LightGBM] [Warning] feature_fraction is set=0.4, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.4
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
(the same three warnings repeat for every internal refit; duplicates omitted)
# Final model parameters for deployment
print(final_model)
LGBMClassifier(bagging_fraction=0.9, bagging_freq=6, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.4,
               importance_type='split', learning_rate=0.5, max_depth=-1,
               min_child_samples=11, min_child_weight=0.001, min_split_gain=0.8,
               n_estimators=300, n_jobs=-1, num_leaves=20, objective=None,
               random_state=1234, reg_alpha=0.001, reg_lambda=5, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
predict_model(final_model);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Light Gradient Boosting Machine | 0.7681 | 0.7466 | 0.3529 | 0.5455 | 0.4286 | 0.2914 | 0.3023 |
unseen_predictions = predict_model(final_model, data=data_unseen)
unseen_predictions.head()
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Light Gradient Boosting Machine | 0.6316 | 0.6288 | 0.2632 | 0.4167 | 0.3226 | 0.0870 | 0.0913 |
age | menopause | tumor-size | inv-nodes | node-caps | deg-malig | breast | breast-quad | irradiat | target | Label | Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.0 | 2.0 | 7.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0 | 1 | 0.6106 |
1 | 4.0 | 1.0 | 5.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0 | 0 | 0.8585 |
2 | 3.0 | 2.0 | 5.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1 | 0 | 0.7739 |
3 | 4.0 | 2.0 | 2.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0 | 0 | 0.8229 |
4 | 5.0 | 1.0 | 6.0 | 0.0 | 1.0 | 2.0 | 1.0 | 4.0 | 1.0 | 1 | 0 | 0.6847 |
from pycaret.utils import check_metric
check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'Accuracy')
0.6316
check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'Recall')
0.2632
check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'Precision')
0.4167
check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'AUC')
0.5395
check_metric(unseen_predictions['target'], unseen_predictions['Label'], metric = 'F1')
0.3226
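The AUC above is computed from the hard labels, which is why it is lower than the 0.6288 reported in the holdout table; ROC AUC needs the positive-class probability. In PyCaret 2.x the Score column holds the probability of the predicted label, so (as a sketch) it must be flipped for rows predicted 0 before scoring:
from sklearn.metrics import roc_auc_score
# recover P(class=1) from the probability of the predicted label
p_pos = np.where(unseen_predictions['Label'].astype(int) == 1,
                 unseen_predictions['Score'],
                 1 - unseen_predictions['Score'])
roc_auc_score(unseen_predictions['target'], p_pos)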
save_model(final_model,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='target', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_strat... boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=0.4, importance_type='split', learning_rate=0.5, max_depth=-1, min_child_samples=11, min_child_weight=0.001, min_split_gain=0.8, n_estimators=300, n_jobs=-1, num_leaves=20, objective=None, random_state=1234, reg_alpha=0.001, reg_lambda=5, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0)]], verbose=False), 'Final_Model.pkl')
load_saved_model = load_model('Final_Model')
Transformation Pipeline and Model Successfully Loaded
new_prediction = predict_model(load_saved_model, data=data_unseen)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Light Gradient Boosting Machine | 0.6316 | 0.6288 | 0.2632 | 0.4167 | 0.3226 | 0.0870 | 0.0913 |
#new_prediction[["Label", "Score"]].head()
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# OpenML Dataset ID
whichDataset = 13 # provide dataset id
import openml
dataset = openml.datasets.get_dataset(whichDataset)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute)
dataset = pd.DataFrame(X, columns=attribute_names)
dataset["target"] = y
data = dataset.sample(frac=0.70, random_state=421)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
Data for Modeling: (200, 10)
Unseen Data For Predictions: (86, 10)
clf = setup(data = data, target = 'target', session_id=1234)
Description | Value | |
---|---|---|
0 | session_id | 1234 |
1 | Target | target |
2 | Target Type | Binary |
3 | Label Encoded | None |
4 | Original Data | (200, 10) |
5 | Missing Values | True |
6 | Numeric Features | 6 |
7 | Categorical Features | 3 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (139, 11) |
12 | Transformed Test Set | (61, 11) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | b749 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
# compare all baseline models and select top 5
top_models = compare_models(n_select = 5, exclude = ['catboost', 'dummy'], sort = 'Accuracy')
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
lr | Logistic Regression | 0.6841 | 0.6419 | 0.2450 | 0.5150 | 0.3082 | 0.1498 | 0.1814 | 0.0120 |
lightgbm | Light Gradient Boosting Machine | 0.6780 | 0.6397 | 0.3750 | 0.4708 | 0.4006 | 0.1931 | 0.1993 | 0.0100 |
ridge | Ridge Classifier | 0.6769 | 0.0000 | 0.2450 | 0.4983 | 0.3046 | 0.1371 | 0.1667 | 0.0050 |
lda | Linear Discriminant Analysis | 0.6626 | 0.6225 | 0.2450 | 0.4983 | 0.3046 | 0.1176 | 0.1512 | 0.0060 |
ada | Ada Boost Classifier | 0.6566 | 0.5611 | 0.4250 | 0.4504 | 0.4164 | 0.1847 | 0.1928 | 0.0360 |
knn | K Neighbors Classifier | 0.6555 | 0.5596 | 0.2700 | 0.3983 | 0.3010 | 0.1071 | 0.1086 | 0.0670 |
nb | Naive Bayes | 0.6495 | 0.6356 | 0.3700 | 0.4952 | 0.4025 | 0.1638 | 0.1795 | 0.0060 |
gbc | Gradient Boosting Classifier | 0.6352 | 0.6100 | 0.3300 | 0.4167 | 0.3555 | 0.1073 | 0.1157 | 0.0270 |
svm | SVM - Linear Kernel | 0.6126 | 0.0000 | 0.2550 | 0.2408 | 0.2173 | 0.0284 | 0.0327 | 0.0050 |
qda | Quadratic Discriminant Analysis | 0.6060 | 0.5550 | 0.2600 | 0.4117 | 0.2933 | 0.0491 | 0.0602 | 0.0050 |
rf | Random Forest Classifier | 0.5989 | 0.5301 | 0.2400 | 0.4383 | 0.2938 | 0.0267 | 0.0477 | 0.2630 |
et | Extra Trees Classifier | 0.5835 | 0.4971 | 0.2400 | 0.3686 | 0.2655 | -0.0086 | 0.0022 | 0.2520 |
xgboost | Extreme Gradient Boosting | 0.5484 | 0.4606 | 0.3100 | 0.3358 | 0.3135 | -0.0150 | -0.0183 | 0.0590 |
dt | Decision Tree Classifier | 0.5044 | 0.4314 | 0.2450 | 0.2583 | 0.2346 | -0.1171 | -0.1190 | 0.0050 |
top_models
[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=1234, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False),
 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=1234, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=1234, solver='auto', tol=0.001),
 LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None, solver='svd', store_covariance=False, tol=0.0001),
 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0, n_estimators=50, random_state=1234)]
# tune top base models
tuned_top_models = [tune_model(i) for i in top_models]
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | ||
---|---|---|---|---|---|---|---|---|
Split | Fold | |||||||
CV-Val | 0 | 0.8571 | 0.8250 | 0.7500 | 0.7500 | 0.7500 | 0.6500 | 0.6500 |
1 | 0.5714 | 0.5500 | 0.5000 | 0.3333 | 0.4000 | 0.0870 | 0.0913 | |
2 | 0.8571 | 0.8250 | 0.7500 | 0.7500 | 0.7500 | 0.6500 | 0.6500 | |
3 | 0.5714 | 0.4750 | 0.2500 | 0.2500 | 0.2500 | -0.0500 | -0.0500 | |
4 | 0.7143 | 0.6500 | 0.5000 | 0.5000 | 0.5000 | 0.3000 | 0.3000 | |
5 | 0.6429 | 0.5889 | 0.4000 | 0.5000 | 0.4444 | 0.1860 | 0.1886 | |
6 | 0.7857 | 0.7889 | 0.8000 | 0.6667 | 0.7273 | 0.5532 | 0.5594 | |
7 | 0.7143 | 0.6444 | 0.4000 | 0.6667 | 0.5000 | 0.3171 | 0.3373 | |
8 | 0.6429 | 0.5889 | 0.4000 | 0.5000 | 0.4444 | 0.1860 | 0.1886 | |
9 | 0.7692 | 0.7639 | 0.7500 | 0.6000 | 0.6667 | 0.4935 | 0.5007 | |
Mean | 0.7126 | 0.6700 | 0.5500 | 0.5517 | 0.5433 | 0.3373 | 0.3416 | |
Std | 0.1000 | 0.1173 | 0.1857 | 0.1594 | 0.1626 | 0.2295 | 0.2297 | |
Train | nan | 0.7122 | 0.6675 | 0.5455 | 0.5455 | 0.5455 | 0.3349 | 0.3349 |
tuned_top_models
[LogisticRegression(C=4.725000000000001, class_weight={}, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=1234, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False),
 LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=0.8, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=36, min_child_weight=0.001, min_split_gain=0.1, n_estimators=30, n_jobs=-1, num_leaves=100, objective=None, random_state=1234, reg_alpha=0.005, reg_lambda=0.05, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 RidgeClassifier(alpha=8.2, class_weight=None, copy_X=True, fit_intercept=True, max_iter=None, normalize=True, random_state=1234, solver='auto', tol=0.001),
 LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=0.2, solver='eigen', store_covariance=False, tol=0.0001),
 AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1e-06, n_estimators=240, random_state=1234)]
# ensemble top tuned models
#bagged_top_models = [ensemble_model(i) for i in tuned_top_models]
#bagged_top_models
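Beyond bagging each model separately, the tuned models can also be combined into a single voting ensemble. A sketch with PyCaret's blend_models, left commented out like the bagging example above (with method='auto', PyCaret falls back to hard voting here because the RidgeClassifier in the list has no predict_proba):
# blender = blend_models(estimator_list=tuned_top_models, method='auto')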
# select the best model for each optimization metric
best1 = automl(optimize = 'AUC')
best2 = automl(optimize = 'Accuracy')
best3 = automl(optimize = 'Recall')
best4 = automl(optimize = 'Precision')
best5 = automl(optimize = 'F1')
print(); print("Best model based on AUC: "); print(best1)
print(); print("Best model based on Accuracy: "); print(best2)
print(); print("Best model based on Recall: "); print(best3)
print(); print("Best model based on Precision: "); print(best4)
print(); print("Best model based on F1: "); print(best5)
Best model based on AUC:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=1234, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

Best model based on Accuracy:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=1234, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

Best model based on Recall:
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0, n_estimators=50, random_state=1234)

Best model based on Precision:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=1234, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

Best model based on F1:
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0, n_estimators=50, random_state=1234)
plot_model(best2, plot = 'auc')
plot_model(best2, plot = 'confusion_matrix')
plot_model(best2, plot = 'learning')
save_model(best2,'Final_Model')
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=[], target='target', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_value_categorical=None, fill_value_numerical=None, numeric_strat... ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'), ('dfs', 'passthrough'), ('pca', 'passthrough'), ['trained_model', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=1234, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)]], verbose=False), 'Final_Model.pkl')
load_saved_model = load_model('Final_Model')
new_prediction = predict_model(load_saved_model, data=data_unseen)
new_prediction.head()
Transformation Pipeline and Model Successfully Loaded
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Logistic Regression | 0.7093 | 0.6646 | 0.1304 | 0.3750 | 0.1935 | 0.0644 | 0.0778 |
age | menopause | tumor-size | inv-nodes | node-caps | deg-malig | breast | breast-quad | irradiat | target | Label | Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.0 | 1.0 | 7.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1 | 0 | 0.7889 |
1 | 3.0 | 2.0 | 6.0 | 1.0 | 0.0 | 1.0 | 0.0 | 2.0 | 1.0 | 1 | 0 | 0.6857 |
2 | 4.0 | 2.0 | 5.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0.5813 |
3 | 3.0 | 2.0 | 2.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.7987 |
4 | 4.0 | 2.0 | 5.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0 | 0 | 0.7501 |
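The loaded pipeline can also score a single new record; a minimal sketch using one unseen row with its known target dropped:
# take one holdout row without the label and predict its class and score
one_row = data_unseen.drop(columns=['target']).head(1)
predict_model(load_saved_model, data=one_row)[['Label', 'Score']]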
In this coding recipe, we demonstrated how to fetch an OpenML dataset, prepare it, and use PyCaret to compare, tune, evaluate, finalize, and deploy a binary classification model in Python.