For more projects visit: https://setscholars.net
# Suppress warnings in Jupyter Notebooks
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pycaret.classification import *
# name of the built-in PyCaret dataset to load
whichDataset = 'satellite'
from pycaret.datasets import get_data
dataset = get_data(whichDataset)
| | Attribute1 | Attribute2 | Attribute3 | Attribute4 | Attribute5 | Attribute6 | Attribute7 | Attribute8 | Attribute9 | Attribute10 | Attribute11 | Attribute12 | Attribute13 | Attribute14 | Attribute15 | Attribute16 | Attribute17 | Attribute18 | Attribute19 | Attribute20 | Attribute21 | Attribute22 | Attribute23 | Attribute24 | Attribute25 | Attribute26 | Attribute27 | Attribute28 | Attribute29 | Attribute30 | Attribute31 | Attribute32 | Attribute33 | Attribute34 | Attribute35 | Attribute36 | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 80 | 102 | 102 | 79 | 76 | 102 | 102 | 79 | 76 | 102 | 106 | 83 | 76 | 99 | 108 | 85 | 76 | 103 | 118 | 88 | 80 | 107 | 118 | 88 | 79 | 107 | 109 | 87 | 79 | 107 | 109 | 87 | 79 | 107 | 113 | 87 | 3 |
1 | 76 | 102 | 102 | 79 | 76 | 102 | 106 | 83 | 76 | 102 | 106 | 87 | 76 | 103 | 118 | 88 | 80 | 107 | 118 | 88 | 80 | 112 | 118 | 88 | 79 | 107 | 109 | 87 | 79 | 107 | 113 | 87 | 79 | 103 | 104 | 83 | 3 |
2 | 80 | 98 | 106 | 79 | 76 | 94 | 102 | 76 | 76 | 94 | 102 | 76 | 80 | 107 | 113 | 85 | 80 | 95 | 100 | 78 | 80 | 95 | 100 | 78 | 79 | 103 | 104 | 79 | 79 | 95 | 100 | 79 | 79 | 95 | 96 | 75 | 4 |
3 | 76 | 94 | 102 | 76 | 76 | 94 | 102 | 76 | 76 | 94 | 102 | 76 | 80 | 95 | 100 | 78 | 80 | 95 | 100 | 78 | 80 | 91 | 100 | 78 | 79 | 95 | 100 | 79 | 79 | 95 | 96 | 75 | 79 | 95 | 100 | 75 | 4 |
4 | 76 | 94 | 102 | 76 | 76 | 94 | 102 | 76 | 76 | 89 | 94 | 76 | 80 | 95 | 100 | 78 | 80 | 91 | 100 | 78 | 80 | 91 | 100 | 74 | 79 | 95 | 96 | 75 | 79 | 95 | 100 | 75 | 75 | 95 | 100 | 79 | 4 |
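This is the Statlog (Landsat Satellite) data: each row carries 36 spectral values for a 3×3 pixel neighbourhood and `Class` is the land/soil category. Before modeling, it is worth confirming how many classes there are and how balanced they are; a minimal check (output not shown here):
# how many classes, and how balanced are they?
print(dataset['Class'].nunique())        # 6 distinct classes
print(dataset['Class'].value_counts())   # rows per class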
dataset.shape
(6435, 37)
dataset.columns.to_list()
['Attribute1', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5', 'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9', 'Attribute10', 'Attribute11', 'Attribute12', 'Attribute13', 'Attribute14', 'Attribute15', 'Attribute16', 'Attribute17', 'Attribute18', 'Attribute19', 'Attribute20', 'Attribute21', 'Attribute22', 'Attribute23', 'Attribute24', 'Attribute25', 'Attribute26', 'Attribute27', 'Attribute28', 'Attribute29', 'Attribute30', 'Attribute31', 'Attribute32', 'Attribute33', 'Attribute34', 'Attribute35', 'Attribute36', 'Class']
data = dataset.sample(frac=0.75, random_state=1234)
data_unseen = dataset.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))
Data for Modeling: (4826, 37)
Unseen Data For Predictions: (1609, 37)
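`DataFrame.sample` draws the 75% modeling split at random without stratifying on `Class`. If preserving class proportions in the hold-out set matters, an equivalent stratified split with scikit-learn would look like this (an alternative sketch, not what this notebook uses):
from sklearn.model_selection import train_test_split
# stratified 75/25 split: both parts keep the original class balance
data_s, data_unseen_s = train_test_split(dataset, test_size=0.25, stratify=dataset['Class'], random_state=1234)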
env_setup = setup(data = data, target = 'Class', session_id=1234)
| | Description | Value |
|---|---|---|
0 | session_id | 1234 |
1 | Target | Class |
2 | Target Type | Multiclass |
3 | Label Encoded | None |
4 | Original Data | (4826, 37) |
5 | Missing Values | False |
6 | Numeric Features | 36 |
7 | Categorical Features | 0 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (3378, 36) |
12 | Transformed Test Set | (1448, 36) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 3d98 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Remove Perfect Collinearity | True |
45 | Clustering | False |
46 | Clustering Iteration | None |
47 | Polynomial Features | False |
48 | Polynomial Degree | None |
49 | Trignometry Features | False |
50 | Polynomial Threshold | None |
51 | Group Features | False |
52 | Feature Selection | False |
53 | Feature Selection Method | classic |
54 | Features Selection Threshold | None |
55 | Feature Interaction | False |
56 | Feature Ratio | False |
57 | Interaction Threshold | None |
58 | Fix Imbalance | False |
59 | Fix Imbalance Method | SMOTE |
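The grid confirms this run keeps PyCaret's defaults: no normalization, no PCA, no outlier removal, no imbalance fix. Each of those rows corresponds to a `setup` argument that can be switched on; an illustrative (untuned, commented-out) customization might be:
# illustrative only: enable some of the preprocessing listed in the grid above
# env_setup = setup(data = data, target = 'Class', session_id = 1234,
#                   normalize = True,                  # z-score scale the 36 numeric features
#                   remove_multicollinearity = True, multicollinearity_threshold = 0.9,
#                   fix_imbalance = True)              # SMOTE, the grid's default method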
# --------------------------------------
# compare all available classifiers with 10-fold cross-validation
best_model = compare_models()
# --------------------------------------
| | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
|---|---|---|---|---|---|---|---|---|---|
lightgbm | Light Gradient Boosting Machine | 0.9195 | 0.9911 | 0.8912 | 0.9196 | 0.9173 | 0.9000 | 0.9006 | 0.4120 |
catboost | CatBoost Classifier | 0.9138 | 0.9919 | 0.8835 | 0.9123 | 0.9110 | 0.8929 | 0.8935 | 10.1880 |
xgboost | Extreme Gradient Boosting | 0.9100 | 0.9908 | 0.8796 | 0.9091 | 0.9076 | 0.8882 | 0.8888 | 1.3480 |
et | Extra Trees Classifier | 0.9079 | 0.9915 | 0.8725 | 0.9064 | 0.9045 | 0.8854 | 0.8862 | 0.3200 |
rf | Random Forest Classifier | 0.9032 | 0.9898 | 0.8674 | 0.9018 | 0.8994 | 0.8795 | 0.8804 | 0.4360 |
knn | K Neighbors Classifier | 0.8928 | 0.9823 | 0.8637 | 0.8933 | 0.8920 | 0.8671 | 0.8675 | 0.0780 |
gbc | Gradient Boosting Classifier | 0.8911 | 0.9884 | 0.8578 | 0.8897 | 0.8886 | 0.8647 | 0.8653 | 2.8260 |
qda | Quadratic Discriminant Analysis | 0.8520 | 0.9709 | 0.7970 | 0.8400 | 0.8381 | 0.8155 | 0.8176 | 0.0320 |
lda | Linear Discriminant Analysis | 0.8440 | 0.9785 | 0.7798 | 0.8398 | 0.8366 | 0.8052 | 0.8072 | 0.0170 |
dt | Decision Tree Classifier | 0.8390 | 0.9027 | 0.8040 | 0.8418 | 0.8392 | 0.8007 | 0.8011 | 0.0230 |
lr | Logistic Regression | 0.8067 | 0.9594 | 0.7397 | 0.7885 | 0.7879 | 0.7581 | 0.7617 | 0.8460 |
nb | Naive Bayes | 0.7948 | 0.9624 | 0.7813 | 0.8241 | 0.8041 | 0.7487 | 0.7510 | 0.0090 |
ridge | Ridge Classifier | 0.7679 | 0.0000 | 0.6428 | 0.7691 | 0.7015 | 0.7040 | 0.7207 | 0.0340 |
svm | SVM - Linear Kernel | 0.6631 | 0.0000 | 0.6262 | 0.6986 | 0.6256 | 0.5853 | 0.6174 | 0.0780 |
ada | Ada Boost Classifier | 0.6294 | 0.8620 | 0.5727 | 0.6246 | 0.6062 | 0.5406 | 0.5530 | 0.1360 |
dummy | Dummy Classifier | 0.2386 | 0.5000 | 0.1667 | 0.0569 | 0.0919 | 0.0000 | 0.0000 | 0.0070 |
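By default `compare_models` ranks on Accuracy and returns only the single best estimator. Both the ranking metric and the number of models kept can be changed, e.g. (not run here):
# rank by F1 instead of Accuracy and keep the three best models as a list
# top3 = compare_models(sort = 'F1', n_select = 3)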
# train a LightGBM classifier with 10-fold cross-validation
lightgbm = create_model('lightgbm')
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.9201 | 0.9921 | 0.8919 | 0.9185 | 0.9184 | 0.9008 | 0.9012 |
1 | 0.9172 | 0.9891 | 0.8938 | 0.9206 | 0.9170 | 0.8972 | 0.8978 |
2 | 0.9320 | 0.9912 | 0.9028 | 0.9323 | 0.9289 | 0.9153 | 0.9161 |
3 | 0.9260 | 0.9945 | 0.9132 | 0.9271 | 0.9265 | 0.9085 | 0.9086 |
4 | 0.9172 | 0.9887 | 0.8885 | 0.9146 | 0.9141 | 0.8971 | 0.8976 |
5 | 0.9112 | 0.9909 | 0.8774 | 0.9103 | 0.9081 | 0.8897 | 0.8905 |
6 | 0.9201 | 0.9937 | 0.8822 | 0.9274 | 0.9162 | 0.9004 | 0.9023 |
7 | 0.9083 | 0.9910 | 0.8846 | 0.9059 | 0.9063 | 0.8864 | 0.8867 |
8 | 0.9407 | 0.9915 | 0.9215 | 0.9411 | 0.9403 | 0.9264 | 0.9266 |
9 | 0.9021 | 0.9887 | 0.8562 | 0.8986 | 0.8978 | 0.8779 | 0.8788 |
Mean | 0.9195 | 0.9911 | 0.8912 | 0.9196 | 0.9173 | 0.9000 | 0.9006 |
SD | 0.0107 | 0.0019 | 0.0176 | 0.0122 | 0.0116 | 0.0134 | 0.0133 |
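Every scoring grid PyCaret displays can also be captured programmatically: `pull()` returns the most recently shown grid as a pandas DataFrame, which is handy for logging fold-level results (output not shown):
# grab the CV grid above as a DataFrame for logging or plotting
lightgbm_cv = pull()
print(lightgbm_cv.loc[['Mean', 'SD']])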
# tune LightGBM hyperparameters via random grid search
tuned_lightgbm = tune_model(lightgbm)
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.9142 | 0.9915 | 0.8865 | 0.9126 | 0.9126 | 0.8934 | 0.8937 |
1 | 0.9142 | 0.9908 | 0.8917 | 0.9178 | 0.9138 | 0.8934 | 0.8940 |
2 | 0.9172 | 0.9897 | 0.8789 | 0.9135 | 0.9119 | 0.8968 | 0.8977 |
3 | 0.9112 | 0.9942 | 0.8896 | 0.9122 | 0.9115 | 0.8900 | 0.8902 |
4 | 0.8964 | 0.9862 | 0.8707 | 0.8933 | 0.8942 | 0.8716 | 0.8718 |
5 | 0.8935 | 0.9891 | 0.8540 | 0.8883 | 0.8891 | 0.8677 | 0.8684 |
6 | 0.9172 | 0.9943 | 0.8828 | 0.9184 | 0.9131 | 0.8970 | 0.8981 |
7 | 0.9024 | 0.9891 | 0.8816 | 0.9017 | 0.9012 | 0.8792 | 0.8797 |
8 | 0.9080 | 0.9902 | 0.8827 | 0.9059 | 0.9060 | 0.8857 | 0.8861 |
9 | 0.8843 | 0.9857 | 0.8327 | 0.8780 | 0.8769 | 0.8556 | 0.8570 |
Mean | 0.9059 | 0.9901 | 0.8751 | 0.9042 | 0.9030 | 0.8830 | 0.8837 |
SD | 0.0107 | 0.0027 | 0.0175 | 0.0129 | 0.0120 | 0.0134 | 0.0133 |
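Note that tuning has actually hurt here: the tuned model's mean CV accuracy (0.9059) is below the untuned LightGBM's (0.9195). `tune_model` runs a random search over a predefined grid and is not guaranteed to beat the defaults; a wider search and `choose_better=True` (which returns whichever of base and tuned scores higher) are the usual remedies, sketched below (not run here):
# widen the random search and keep the better of base vs tuned
# tuned_lightgbm = tune_model(lightgbm, n_iter = 50, optimize = 'Accuracy', choose_better = True)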
print(tuned_lightgbm)
LGBMClassifier(bagging_fraction=0.9, bagging_freq=0, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=1.0,
               importance_type='split', learning_rate=0.3, max_depth=-1,
               min_child_samples=61, min_child_weight=0.001, min_split_gain=0.3,
               n_estimators=190, n_jobs=-1, num_leaves=20, objective=None,
               random_state=1234, reg_alpha=0.15, reg_lambda=0.0001, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
plot_model(tuned_lightgbm, plot = 'auc')               # ROC curves (one-vs-rest)
plot_model(tuned_lightgbm, plot = 'pr')                # precision-recall curves
plot_model(tuned_lightgbm, plot = 'feature')           # feature importance
plot_model(tuned_lightgbm, plot = 'confusion_matrix')  # confusion matrix
plot_model(tuned_lightgbm, plot = 'learning')          # learning curve
# plot_model(tuned_lightgbm, plot = 'threshold')       # discrimination threshold (binary problems only)
plot_model(tuned_lightgbm, plot = 'boundary')          # decision boundary
plot_model(tuned_lightgbm, plot = 'error')             # class prediction error
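Rather than calling `plot_model` once per chart, `evaluate_model` wraps the same plots in an interactive selector widget inside the notebook (output not shown):
# interactive plot browser (Jupyter only)
evaluate_model(tuned_lightgbm)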
# train a CatBoost classifier with 10-fold cross-validation
catboost = create_model('catboost')
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.9142 | 0.9942 | 0.8804 | 0.9126 | 0.9117 | 0.8932 | 0.8937 |
1 | 0.9290 | 0.9920 | 0.9131 | 0.9311 | 0.9291 | 0.9120 | 0.9124 |
2 | 0.9024 | 0.9916 | 0.8644 | 0.8964 | 0.8975 | 0.8785 | 0.8791 |
3 | 0.9290 | 0.9944 | 0.9140 | 0.9298 | 0.9293 | 0.9121 | 0.9122 |
4 | 0.9083 | 0.9894 | 0.8771 | 0.9080 | 0.9051 | 0.8859 | 0.8867 |
5 | 0.9142 | 0.9911 | 0.8818 | 0.9131 | 0.9111 | 0.8934 | 0.8944 |
6 | 0.9172 | 0.9935 | 0.8783 | 0.9143 | 0.9115 | 0.8969 | 0.8980 |
7 | 0.9142 | 0.9924 | 0.8912 | 0.9125 | 0.9127 | 0.8935 | 0.8937 |
8 | 0.9080 | 0.9910 | 0.8768 | 0.9066 | 0.9051 | 0.8855 | 0.8862 |
9 | 0.9021 | 0.9899 | 0.8585 | 0.8989 | 0.8972 | 0.8778 | 0.8789 |
Mean | 0.9138 | 0.9919 | 0.8835 | 0.9123 | 0.9110 | 0.8929 | 0.8935 |
SD | 0.0090 | 0.0016 | 0.0173 | 0.0107 | 0.0105 | 0.0114 | 0.0112 |
# tune CatBoost hyperparameters via random grid search
tuned_catboost = tune_model(catboost)
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
0 | 0.9142 | 0.9919 | 0.8755 | 0.9131 | 0.9098 | 0.8930 | 0.8940 |
1 | 0.9112 | 0.9893 | 0.8871 | 0.9123 | 0.9104 | 0.8897 | 0.8903 |
2 | 0.9112 | 0.9875 | 0.8781 | 0.9089 | 0.9072 | 0.8895 | 0.8903 |
3 | 0.9053 | 0.9924 | 0.8867 | 0.9060 | 0.9053 | 0.8828 | 0.8830 |
4 | 0.8994 | 0.9876 | 0.8658 | 0.8977 | 0.8959 | 0.8749 | 0.8757 |
5 | 0.9112 | 0.9883 | 0.8772 | 0.9091 | 0.9080 | 0.8897 | 0.8904 |
6 | 0.9112 | 0.9926 | 0.8701 | 0.9085 | 0.9065 | 0.8896 | 0.8905 |
7 | 0.8935 | 0.9890 | 0.8657 | 0.8907 | 0.8907 | 0.8679 | 0.8683 |
8 | 0.8902 | 0.9851 | 0.8542 | 0.8886 | 0.8867 | 0.8632 | 0.8640 |
9 | 0.8902 | 0.9879 | 0.8415 | 0.8869 | 0.8846 | 0.8629 | 0.8642 |
Mean | 0.9038 | 0.9892 | 0.8702 | 0.9022 | 0.9005 | 0.8803 | 0.8811 |
SD | 0.0091 | 0.0023 | 0.0135 | 0.0097 | 0.0095 | 0.0114 | 0.0113 |
print(tuned_catboost)
<catboost.core.CatBoostClassifier object at 0x7f8350631490>
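Unlike scikit-learn estimators, CatBoost's default repr prints only the object's memory address. To inspect the tuned hyperparameters, the fitted model's own accessor can be used (output not shown):
# full resolved parameter dictionary of the fitted CatBoost model
print(tuned_catboost.get_all_params())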
plot_model(tuned_catboost, plot = 'auc')  # ROC curves (one-vs-rest)
plot_model(tuned_catboost, plot = 'pr')   # precision-recall curves
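The 1,609-row `data_unseen` hold-out carved off at the start has not been used yet. The usual closing steps in a PyCaret workflow are to refit the chosen model on all of the modeling data and then score the unseen rows; a sketch, not run here (the `Label`/`Score` column names are PyCaret 2.x conventions):
# refit on the full modeling set, then score the truly unseen rows
# final_lightgbm = finalize_model(tuned_lightgbm)
# unseen_predictions = predict_model(final_lightgbm, data = data_unseen)
# print(unseen_predictions[['Class', 'Label', 'Score']].head())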