Learn by Coding Examples in Applied Machine Learning

Bagging ensemble machine learning algorithms in Python using scikit-learn

In [6]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

Bagged Decision Trees for Classification

In [7]:
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the data. Column names are supplied explicitly via `names`, so
# read_csv treats every row of the file as data.
filename = 'pima.indians.diabetes.data.csv'

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

dataframe = pd.read_csv(filename, names=names)
print()
print(dataframe.head())

# Split the raw values: the first eight columns are the features,
# the ninth ('class') is the target.
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

seed = 7

# 10-fold cross-validation on contiguous, unshuffled folds.
# random_state is deliberately NOT passed: it only has an effect together with
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# set while shuffle is False. The resulting splits are the same as before.
kfold = model_selection.KFold(n_splits=10)

# Bag 1000 decision trees. The base estimator is passed positionally because
# its keyword was renamed from `base_estimator` to `estimator` in newer
# scikit-learn; the positional form works with both names.
cart = DecisionTreeClassifier()
num_trees = 1000
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)

# One accuracy score per fold; report the mean.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)

print()
print("Accuracy Results: ")
print(results.mean())
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1

Accuracy Results: 
0.7642515379357484

Random Forest Classification

In [8]:
# Bind pandas to `pd`, the name this cell actually uses, so the cell does not
# depend on an alias created by an earlier cell.
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

# Load the data. Column names are supplied explicitly via `names`, so
# read_csv treats every row of the file as data.
filename = 'pima.indians.diabetes.data.csv'

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

dataframe = pd.read_csv(filename, names=names)
print()
print(dataframe.head())

# Split the raw values: the first eight columns are the features,
# the ninth ('class') is the target.
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

seed = 7
num_trees = 1000
max_features = 6  # number of features considered at each split

# 10-fold cross-validation on contiguous, unshuffled folds.
# random_state is deliberately NOT passed: it only has an effect together with
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# set while shuffle is False. The resulting splits are the same as before.
kfold = model_selection.KFold(n_splits=10)

# Seed the forest itself; without random_state the bootstrap samples and
# feature subsets differ on every run, so the reported accuracy would too.
model = RandomForestClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=seed,
)

# One accuracy score per fold; report the mean.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)

print()
print("Accuracy Results: ")
print(results.mean())
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1

Accuracy Results: 
0.7668318523581681

Extra Trees Classification

In [9]:
# Bind pandas to `pd`, the name this cell actually uses, so the cell does not
# depend on an alias created by an earlier cell.
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import ExtraTreesClassifier

# Load the data. Column names are supplied explicitly via `names`, so
# read_csv treats every row of the file as data.
filename = 'pima.indians.diabetes.data.csv'

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

dataframe = pd.read_csv(filename, names=names)
print()
print(dataframe.head())

# Split the raw values: the first eight columns are the features,
# the ninth ('class') is the target.
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

seed = 7
num_trees = 1000
max_features = 7  # number of features considered at each split

# 10-fold cross-validation on contiguous, unshuffled folds.
# random_state is deliberately NOT passed: it only has an effect together with
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# set while shuffle is False. The resulting splits are the same as before.
kfold = model_selection.KFold(n_splits=10)

# Seed the ensemble itself; without random_state the randomized splits differ
# on every run, so the reported accuracy would too.
model = ExtraTreesClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=seed,
)

# One accuracy score per fold; report the mean.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)

print()
print("Accuracy Results: ")
print(results.mean())
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1

Accuracy Results: 
0.7668318523581681
In [ ]: