# -----------------------------------------------------------------
# How to set up a Machine Learning Classification problem in R
# -----------------------------------------------------------------
# load libraries
library(mlbench)
library(caret)
# Load data
data(PimaIndiansDiabetes)
# Rename the dataset so the code below stays generic
dataset <- PimaIndiansDiabetes
# Inspect the data: dimensions and the class of every column.
# Explicit print() keeps the output visible when the script is run through
# source(), where top-level expressions are not auto-printed.
print(dim(dataset))
print(
  vapply(
    dataset,
    function(column) paste(class(column), collapse = "/"),
    character(1)
  )
)
# Pre-processing of the dataset, i.e. the train : test split.
# createDataPartition() samples at random, so the RNG is seeded first to make
# the split (and everything derived from it) reproducible between runs.
set.seed(7)
train_test_index <- createDataPartition(
  dataset$diabetes,
  p = 0.67,
  list = FALSE
)
# Rows selected by the partition form the training set; the rest are held out.
training_dataset <- dataset[train_test_index, ]
testing_dataset <- dataset[-train_test_index, ]
# Set up cross-validation and control parameters:
# 10-fold cross-validation repeated 10 times, per-iteration logging off.
# Argument names are spelled out in full (verboseIter, preProcess) instead of
# relying on R's partial argument matching.
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  verboseIter = FALSE,
  search = "grid"
)
metric <- "Accuracy"
# Training process
# The RNG is re-seeded with the same value before every train() call so that
# both models are resampled on the same folds; this keeps the later
# model-to-model comparison of resampling results like-for-like.
# Fit / train a Linear Discriminant Analysis model to the training dataset
set.seed(7)
fit.lda <- caret::train(
  diabetes ~ .,
  data = training_dataset,
  method = "lda",
  metric = metric,
  preProcess = c("center", "scale"),
  trControl = control
)
# Fit / train a Logistic Regression model to the training dataset
set.seed(7)
fit.glm <- caret::train(
  diabetes ~ .,
  data = training_dataset,
  method = "glm",
  metric = metric,
  preProcess = c("center", "scale"),
  trControl = control
)
# Collect the resampling results of the trained models
results <- resamples(list(LDA = fit.lda, GLM = fit.glm))
# Summarize the fitted models.
# Explicit print() is used throughout this section: the summary and the
# lattice plot objects are only displayed when printed, which does not happen
# automatically when the script is run through source() or inside a function.
print(summary(results))
# Plot and rank the fitted models
print(dotplot(results))
print(bwplot(results))
# Test the skill of the trained model on the validation/testing dataset.
# NOTE(review): the LDA fit is evaluated here unconditionally; confirm against
# the resampling summary that it really is the better of the two models.
predictions_LDA <- predict(fit.lda, newdata = testing_dataset)
# Evaluate the model against the held-out labels. Arguments are named so the
# predictions and the reference labels cannot be swapped by accident.
res_ <- caret::confusionMatrix(
  data = predictions_LDA,
  reference = testing_dataset$diabetes
)
# cat() interprets the trailing "\n" as a newline; print() would instead show
# the quoted string with the escape sequence left visible.
cat("Results from the BEST trained model ... ...\n")
# Overall statistics from the confusion matrix, rounded to 3 digits
print(round(res_$overall, digits = 3))