In statistics, Naive Bayes classifiers are a family of simple "probabilistic classifiers" based on applying Bayes' theorem with strong (naive) independence assumptions between the features. They are among the simplest Bayesian network models, but when coupled with kernel density estimation they can achieve higher accuracy levels.
Naive Bayes classifiers are highly scalable, requiring a number of parameters linear in the number of variables (features/predictors) in a learning problem. Maximum-likelihood training can be done by evaluating a closed-form expression, which takes linear time, rather than by expensive iterative approximation as used for many other types of classifiers.
In the statistics and computer science literature, naive Bayes models are known under a variety of names, including simple Bayes and independence Bayes. All these names reference the use of Bayes' theorem in the classifier's decision rule, but naïve Bayes is not (necessarily) a Bayesian method.
In computer graphics, a digital differential analyzer (DDA) is hardware or software used for interpolation of variables over an interval between start and end point. DDAs are used for rasterization of lines, triangles and polygons. They can be extended to non linear functions, such as perspective correct texture mapping, quadratic curves, and traversing voxels.
Here we are going to implement the Naive Bayes (caret methods "naive_bayes" and "nb") and DDA algorithms using the Telecom Churn dataset.
library(DBI)
library(corrgram)
library(caret)
library(gridExtra)
library(ggpubr)
Today is a good day to start parallelizing your code. The common motivation behind parallel computing is that some computation is taking too long. For some people that means any computation taking more than 3 minutes — this is because parallelization is incredibly simple, and most tasks that take time are embarrassingly parallel. Here are a few common tasks that fit the description:
# Register a parallel backend so caret can evaluate resampling folds
# concurrently. The original code loaded BOTH doParallel and doMC
# unconditionally; doMC relies on forking and does not work on Windows,
# so choose the backend based on the operating system instead.
if (.Platform$OS.type == "windows") {
  # Windows: socket-based cluster via doParallel
  library(doParallel)
  cl <- makeCluster(detectCores(), type = 'PSOCK')
  registerDoParallel(cl)
} else {
  # Mac OSX and UNIX-like systems: fork-based backend via doMC
  library(doMC)
  registerDoMC(cores = 4)
}
# The working directory must contain the CSV file:
# getwd()
# setwd("...YOUR WORKING DIRECTORY WITH A DATASET...")
# getwd()

# Load the dataset
dataSet <- read.csv("TelcoCustomerChurnDataset.csv", header = TRUE, sep = ',')

# Column names of the data frame
colnames(dataSet)

# First 10 rows of the dataset
head(dataSet, n = 10)

# Last 10 rows of the dataset
tail(dataSet, n = 10)

# Dimension of the dataset (rows x columns)
dim(dataSet)

# Frequency table of the column classes
table(unlist(lapply(dataSet, class)))
# Check the data type of each individual column.
# The original code called data.class() once per column (21 nearly
# identical statements); applying it over the vector of column names
# removes the repetition and returns one named character vector.
inspectedCols <- c("Account_Length", "Vmail_Message", "Day_Mins", "Eve_Mins",
                   "Night_Mins", "Intl_Mins", "CustServ_Calls", "Intl_Plan",
                   "Vmail_Plan", "Day_Calls", "Day_Charge", "Eve_Calls",
                   "Eve_Charge", "Night_Calls", "Night_Charge", "Intl_Calls",
                   "Intl_Charge", "State", "Phone", "Churn")
vapply(dataSet[inspectedCols], data.class, character(1))
# Encode the categorical columns as numeric codes.
# Since R 4.0, read.csv() returns character columns (stringsAsFactors is
# FALSE by default), and as.numeric() on a character vector like "yes"/"no"
# produces NA with a warning. Converting through factor() first yields
# stable integer codes, and is equivalent to the original behavior when
# the columns were read as factors under older R versions.
dataSet$Intl_Plan  <- as.numeric(factor(dataSet$Intl_Plan))
dataSet$Vmail_Plan <- as.numeric(factor(dataSet$Vmail_Plan))
dataSet$State      <- as.numeric(factor(dataSet$State))
# Re-check the data types of each column after the conversion
table(unlist(lapply(dataSet, class)))
# Count missing values per row
rowSums(is.na(dataSet))
# Count missing values per column
colSums(is.na(dataSet))

# Display the missing-data pattern with the mice package
library(mice)
md.pattern(dataSet)

# Visualize missingness with the VIM package
library(VIM)
missingness_plot <- aggr(
  dataSet,
  col = c('navyblue', 'yellow'),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(dataSet[1:21]),
  cex.axis = .9,
  gap = 3,
  ylab = c("Missing data", "Pattern")
)
After this inspection, we can claim that the dataset contains no missing values.
# Names of the numeric-typed columns: every column in 1:20 except
# column 8 (the Churn class label).
numericalCols <- names(dataSet)[setdiff(1:20, 8)]
Difference between the lapply and sapply functions (we will use both in the next two cells):
We use lapply - when we want to apply a function to each element of a list in turn and get a list back.
We use sapply - when we want to apply a function to each element of a list in turn, but we want a vector back, rather than a list.
# Column-wise summary statistics, first as lists via lapply() ...
lapply(dataSet[numericalCols], sum)      # sum
lapply(dataSet[numericalCols], mean)     # mean
lapply(dataSet[numericalCols], median)   # median
lapply(dataSet[numericalCols], min)      # min
lapply(dataSet[numericalCols], max)      # max
lapply(dataSet[numericalCols], length)   # length

# ... then as named vectors via sapply()
sapply(dataSet[numericalCols], sum)      # sum
sapply(dataSet[numericalCols], mean)     # mean
sapply(dataSet[numericalCols], median)   # median
sapply(dataSet[numericalCols], min)      # min
sapply(dataSet[numericalCols], max)      # max
sapply(dataSet[numericalCols], length)   # length
In the next few cells, you will find three different options on how to aggregate data.
# OPTION 1: aggregate() over all numeric variables at once, grouped by Churn
aggregate(dataSet[numericalCols], by = list(dataSet$Churn), FUN = summary)

# OPTION 2: aggregate() one variable at a time
aggregate(dataSet$Intl_Mins,  by = list(dataSet$Churn), FUN = summary)
aggregate(dataSet$Day_Mins,   by = list(dataSet$Churn), FUN = summary)
aggregate(dataSet$Night_Mins, by = list(dataSet$Churn), FUN = summary)

# OPTION 3: by() instead of aggregate(); dataSet[8] is the grouping column
# (presumably Churn, matching OPTIONs 1 and 2 -- TODO confirm column order)
by(dataSet$Intl_Mins,  dataSet[8], summary)
by(dataSet$Day_Mins,   dataSet[8], summary)
by(dataSet$Night_Mins, dataSet[8], summary)
# Correlations / covariances among a subset of the numeric variables
library(Hmisc)
corCols <- c(2, 5, 11, 13, 16, 18)
cor(dataSet[corCols], use = "complete.obs", method = "kendall")
cov(dataSet[corCols], use = "complete.obs")
# Pearson correlations together with significance levels
rcorr(as.matrix(dataSet[corCols]), type = "pearson")
# Pie chart of the class distribution, with sample sizes in the labels
class_counts <- table(dataSet$Churn)
slice_labels <- paste(names(class_counts), "\n", class_counts, sep = "")
pie(class_counts,
    labels = slice_labels,
    col = rainbow(length(slice_labels)),
    main = "Pie Chart of Classes\n (with sample sizes)")

# Bar plots of the class variable: counts and proportions,
# vertical and horizontal
par(mfrow = c(1, 1))
barplot(table(dataSet$Churn),
        ylab = "Count", col = c("darkblue", "red"))
barplot(prop.table(table(dataSet$Churn)),
        ylab = "Proportion", col = c("darkblue", "red"))
barplot(table(dataSet$Churn),
        xlab = "Count", horiz = TRUE, col = c("darkblue", "red"))
barplot(prop.table(table(dataSet$Churn)),
        xlab = "Proportion", horiz = TRUE, col = c("darkblue", "red"))
# Scatterplot matrix from the gclus package
library(gclus)
plot_data <- dataSet[c(2, 5, 11, 13, 16, 18)]
abs_corr  <- abs(cor(plot_data))    # absolute pairwise correlations
panel_col <- dmat.color(abs_corr)   # panel colors derived from correlations
# Reorder so the most correlated variables sit closest to the diagonal
var_order <- order.single(abs_corr)
cpairs(plot_data, var_order,
       panel.colors = panel_col, gap = .5,
       main = "Variables Ordered and Colored by Correlation")

# Correlogram of the same variables
corrgram(dataSet[c(2, 5, 11, 13, 16, 18)],
         order = TRUE,
         lower.panel = panel.shade, upper.panel = panel.pie,
         text.panel = panel.txt, main = " ")
# More graphs on correlations among the data
# Using "Hmisc"
corr_res <- rcorr(as.matrix(dataSet[, c(2, 5, 11, 13, 16, 18)]))
# Correlation coefficients
corr_res$r
# p-values
corr_res$P

# Using "corrplot"
library(corrplot)
library(RColorBrewer)
corrplot(corr_res$r, type = "upper", order = "hclust",
         col = brewer.pal(n = 8, name = "RdYlBu"),
         tl.col = "black", tl.srt = 45)
corrplot(corr_res$r, type = "lower", order = "hclust",
         col = brewer.pal(n = 8, name = "RdYlBu"),
         tl.col = "black", tl.srt = 45)

# Using PerformanceAnalytics
library(PerformanceAnalytics)
numeric_subset <- dataSet[, c(2, 5, 11, 13, 16, 18)]
chart.Correlation(numeric_subset, histogram = TRUE, pch = 19)

# Using a colored heatmap
heat_palette <- colorRampPalette(c("blue", "white", "red"))(20)
heatmap(x = corr_res$r, col = heat_palette, symm = TRUE)
We should notice that Night_Mins and Night_Charge have a strong, linear, positive relationship.
# Split the data into a 75% training / 25% testing partition, stratified
# on the Churn class. set.seed() makes the partition reproducible across
# runs -- the original code produced a different split every time.
set.seed(7)
train_test_index <- createDataPartition(dataSet$Churn, p = 0.75, list = FALSE)
# Keep columns 1:20 as the modeling frame (Churn is among them; column 21
# is dropped -- presumably a non-predictive field, TODO confirm)
training_dataset <- dataSet[, c(1:20)][train_test_index, ]
testing_dataset  <- dataSet[, c(1:20)][-train_test_index, ]
dim(training_dataset)
dim(testing_dataset)
# Cross-validation settings shared by every model below:
# 2x repeated 2-fold CV, grid search, parallel fold evaluation.
control <- trainControl(method = "repeatedcv", # repeatedcv / adaptive_cv
                        number = 2, repeats = 2,
                        verbose = TRUE, search = "grid",
                        allowParallel = TRUE)
# Model-selection metric and default tuning-grid size.
metric <- "Accuracy"
# Fixed: assignment used '=', which at top level is reserved for
# argument binding by convention -- use '<-' for assignment.
tuneLength <- 2
# List the models caret supports, and details for the three used here
names(getModelInfo())
getModelInfo("naive_bayes"); getModelInfo("nb"); getModelInfo("dda");
# Baseline fits: default tuning, no preprocessing

# naive_bayes
fit.naive_bayes <- caret::train(
  Churn ~ ., data = training_dataset, method = "naive_bayes",
  metric = metric, trControl = control, verbose = TRUE
)
print(fit.naive_bayes)

# nb
fit.nb <- caret::train(
  Churn ~ ., data = training_dataset, method = "nb",
  metric = metric, trControl = control, verbose = TRUE
)
print(fit.nb)

# dda
fit.dda <- caret::train(
  Churn ~ ., data = training_dataset, method = "dda",
  metric = metric, trControl = control, verbose = TRUE
)
print(fit.dda)
# Same three models, now with centering and scaling of the predictors

# naive_bayes
fit.naive_bayes_preProc <- caret::train(
  Churn ~ ., data = training_dataset, method = "naive_bayes",
  metric = metric, trControl = control,
  preProc = c("center", "scale"), verbose = TRUE
)
print(fit.naive_bayes_preProc)

# nb
fit.nb_preProc <- caret::train(
  Churn ~ ., data = training_dataset, method = "nb",
  metric = metric, trControl = control,
  preProc = c("center", "scale"), verbose = TRUE
)
print(fit.nb_preProc)

# dda
fit.dda_preProc <- caret::train(
  Churn ~ ., data = training_dataset, method = "dda",
  metric = metric, trControl = control,
  preProc = c("center", "scale"), verbose = TRUE
)
print(fit.dda_preProc)
# Same three models with preprocessing plus an automatic tuning grid
# of size tuneLength

# naive_bayes
fit.naive_bayes_automaticGrid <- caret::train(
  Churn ~ ., data = training_dataset, method = "naive_bayes",
  metric = metric, trControl = control,
  preProc = c("center", "scale"),
  tuneLength = tuneLength, verbose = TRUE
)
print(fit.naive_bayes_automaticGrid)

# nb
fit.nb_automaticGrid <- caret::train(
  Churn ~ ., data = training_dataset, method = "nb",
  metric = metric, trControl = control,
  preProc = c("center", "scale"),
  tuneLength = tuneLength, verbose = TRUE
)
print(fit.nb_automaticGrid)

# dda
fit.dda_automaticGrid <- caret::train(
  Churn ~ ., data = training_dataset, method = "dda",
  metric = metric, trControl = control,
  preProc = c("center", "scale"),
  tuneLength = tuneLength, verbose = TRUE
)
print(fit.dda_automaticGrid)
# naive_bayes with a manually specified tuning grid.
# Fixed: usekernel is a logical parameter -- the original supplied the
# strings "FALSE"/"TRUE", which only work through implicit coercion and
# make the grid a character column. 1:10 replaces the equivalent
# seq(from = 1, to = 10, by = 1) calls.
grid <- expand.grid(usekernel = c(FALSE, TRUE),
                    laplace = 1:10,
                    adjust = 1:10)
fit.naive_bayes_manualGrid <- caret::train(Churn~., data=training_dataset, method="naive_bayes",
                                           metric=metric,
                                           trControl=control,
                                           preProc=c("center", "scale"),
                                           tuneGrid = grid,
                                           verbose = TRUE
)
print(fit.naive_bayes_manualGrid)
plot(fit.naive_bayes_manualGrid)
# nb (klaR NaiveBayes) with a manually specified tuning grid:
# fL = Laplace correction, usekernel = kernel density vs. normal density,
# adjust = kernel bandwidth factor.
# Fixed: usekernel must be logical, not the strings "FALSE"/"TRUE".
grid <- expand.grid(usekernel = c(FALSE, TRUE),
                    fL = seq(from = 1, to = 10, by = 3),
                    adjust = seq(from = 1, to = 10, by = 3))
fit.nb_manualGrid <- caret::train(Churn~., data=training_dataset, method="nb",
                                  metric=metric,
                                  trControl=control,
                                  preProc=c("center", "scale"),
                                  tuneGrid = grid,
                                  verbose = TRUE
)
print(fit.nb_manualGrid)
# dda with a manual tuning grid over model type and shrinkage strategy
dda_models     <- c("Linear", "Quadratic")
dda_shrinkages <- c("Mean", "None", "Variance")
grid <- expand.grid(model = dda_models, shrinkage = dda_shrinkages)
fit.dda_manualGrid <- caret::train(
  Churn ~ ., data = training_dataset, method = "dda",
  metric = metric, trControl = control,
  preProc = c("center", "scale"),
  tuneGrid = grid, verbose = TRUE
)
print(fit.dda_manualGrid)
plot(fit.dda_manualGrid)
# Compare the resampling distributions of the trained models.
# The dda fits stay commented out, as in the original analysis.
model_list <- list(
  trained_Model_1  = fit.naive_bayes,
  trained_Model_2  = fit.nb,
  # trained_Model_3  = fit.dda,
  trained_Model_4  = fit.naive_bayes_preProc,
  trained_Model_5  = fit.nb_preProc,
  # trained_Model_6  = fit.dda_preProc,
  trained_Model_7  = fit.naive_bayes_automaticGrid,
  trained_Model_8  = fit.nb_automaticGrid,
  # trained_Model_9  = fit.dda_automaticGrid,
  trained_Model_10 = fit.naive_bayes_manualGrid,
  trained_Model_11 = fit.nb_manualGrid
  # trained_Model_12 = fit.dda_manualGrid
)
results <- resamples(model_list)
summary(results)
dotplot(results)
bwplot(results)
# Evaluate the chosen model on the held-out test set.
best_trained_model <- fit.naive_bayes_automaticGrid
predictions <- predict(best_trained_model, newdata = testing_dataset)
res_ <- caret::confusionMatrix(table(predictions, testing_dataset$Churn))
# Fixed: print() renders "\n" literally; cat() emits the newline.
cat("Results from the BEST trained model ... ...\n")
print(round(res_$overall, digits = 3))
# Persist the trained model for later reuse
# getwd()
saveRDS(best_trained_model, "./best_trained_model.rds")
# Load the saved model back from disk
# getwd()
saved_model <- readRDS("./best_trained_model.rds")
print(saved_model)

# Make predictions on "new data" using the final model.
# NOTE(review): dataSet is the full dataset, including the training rows,
# so these numbers overstate out-of-sample performance -- confirm intent.
final_predictions <- predict(saved_model, dataSet[1:20])
# Fixed: the original computed confusionMatrix() twice with identical
# arguments; compute it once and reuse the result.
res_ <- confusionMatrix(table(final_predictions, dataSet$Churn))
print(res_)
# Fixed: print() renders "\n" literally; cat() emits the newline.
cat("Results from the BEST trained model ... ...\n")
print(round(res_$overall, digits = 3))
print(res_$table)
fourfoldplot(res_$table, color = c("#CC6666", "#99CC99"),
             conf.level = 0, margin = 1, main = "Confusion Matrix")