Breast Cancer Prediction in R

# Breast Cancer

# binary classification, categorical attributes

# Description: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)
# World-Class Results: http://www.is.umk.pl/projects/datasets.html#Wisconsin

# load libraries
library(mlbench)
library(caret)

# use multiple cores
library(doMC)
registerDoMC(cores=8)
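# Note: doMC is not available on Windows; as an alternative (an assumption,
# not part of the original notebook), caret also works with the doParallel
# backend, e.g.:
#   library(doParallel)
#   registerDoParallel(cores=8)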

# Load data
data(BreastCancer)

# Split out validation dataset
# create a list of 80% of the rows in the original dataset we can use for training
set.seed(7)
validation_index <- createDataPartition(BreastCancer$Class, p=0.80, list=FALSE)
# select 20% of the data for validation
validation <- BreastCancer[-validation_index,]
# use the remaining 80% of the data for training and testing the models
dataset <- BreastCancer[validation_index,]


# data analysis

# dimensions of dataset
dim(dataset)

# peek
head(dataset, n=20)

# types
sapply(dataset, class)

# Remove redundant variable Id
dataset <- dataset[,-1]
# convert input values to numeric
for(i in 1:9) {
    dataset[,i] <- as.numeric(as.character(dataset[,i]))
}

# summary
summary(dataset)

# class distribution
cbind(freq=table(dataset$Class), percentage=prop.table(table(dataset$Class))*100)

# summarize correlations between input variables
complete_cases <- complete.cases(dataset)
cor(dataset[complete_cases,1:9])
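
# As a hedged aside (not in the original script), caret's findCorrelation()
# can flag attributes whose pairwise correlation exceeds a cutoff; here it
# would flag one of the strongly related Cell.size / Cell.shape pair
findCorrelation(cor(dataset[complete_cases,1:9]), cutoff=0.90)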

# histograms for each attribute
par(mfrow=c(3,3))
for(i in 1:9) {
    hist(dataset[,i], main=names(dataset)[i])
}

# density plot for each attribute
par(mfrow=c(3,3))
complete_cases <- complete.cases(dataset)
for(i in 1:9) {
    plot(density(dataset[complete_cases,i]), main=names(dataset)[i])
}

# boxplots for each attribute
par(mfrow=c(3,3))
for(i in 1:9) {
    boxplot(dataset[,i], main=names(dataset)[i])
}

# scatterplot matrix
jittered_x <- sapply(dataset[,1:9], jitter)
pairs(jittered_x, names(dataset[,1:9]), col=dataset$Class)

# bar plots of each variable by class
par(mfrow=c(3,3))
for(i in 1:9) {
    barplot(table(dataset$Class,dataset[,i]), main=names(dataset)[i], legend.text=unique(dataset$Class))
}



# Evaluate Algorithms

# 10-fold cross validation with 3 repeats
control <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"

# Logistic Regression (LG)
set.seed(7)
fit.glm <- train(Class~., data=dataset, method="glm", metric=metric, trControl=control, na.action=na.omit)

# LDA
set.seed(7)
fit.lda <- train(Class~., data=dataset, method="lda", metric=metric, trControl=control, na.action=na.omit)

# GLMNET
set.seed(7)
fit.glmnet <- train(Class~., data=dataset, method="glmnet", metric=metric, trControl=control, na.action=na.omit)

# KNN
set.seed(7)
fit.knn <- train(Class~., data=dataset, method="knn", metric=metric, trControl=control, na.action=na.omit)

# CART
set.seed(7)
fit.cart <- train(Class~., data=dataset, method="rpart", metric=metric, trControl=control, na.action=na.omit)

# Naive Bayes
set.seed(7)
fit.nb <- train(Class~., data=dataset, method="nb", metric=metric, trControl=control, na.action=na.omit)

# SVM
set.seed(7)
fit.svm <- train(Class~., data=dataset, method="svmRadial", metric=metric, trControl=control, na.action=na.omit)

# Compare algorithms
results <- resamples(list(LG=fit.glm, LDA=fit.lda, GLMNET=fit.glmnet, KNN=fit.knn, CART=fit.cart, NB=fit.nb, SVM=fit.svm))
summary(results)
dotplot(results)


# Evaluate Algorithms with Box-Cox Transform

# 10-fold cross validation with 3 repeats
control <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"

# Logistic Regression (LG)
set.seed(7)
fit.glm <- train(Class~., data=dataset, method="glm", metric=metric, preProc=c("BoxCox"), trControl=control, na.action=na.omit)

# LDA
set.seed(7)
fit.lda <- train(Class~., data=dataset, method="lda", metric=metric, preProc=c("BoxCox"), trControl=control, na.action=na.omit)

# GLMNET
set.seed(7)
fit.glmnet <- train(Class~., data=dataset, method="glmnet", metric=metric, preProc=c("BoxCox"), trControl=control, na.action=na.omit)

# KNN
set.seed(7)
fit.knn <- train(Class~., data=dataset, method="knn", metric=metric, preProc=c("BoxCox"), trControl=control, na.action=na.omit)

# CART
set.seed(7)
fit.cart <- train(Class~., data=dataset, method="rpart", metric=metric, preProc=c("BoxCox"), trControl=control, na.action=na.omit)

# Naive Bayes
set.seed(7)
fit.nb <- train(Class~., data=dataset, method="nb", metric=metric, preProc=c("BoxCox"), trControl=control, na.action=na.omit)

# SVM
set.seed(7)
fit.svm <- train(Class~., data=dataset, method="svmRadial", metric=metric, preProc=c("BoxCox"), trControl=control, na.action=na.omit)

# Compare algorithms
transform_results <- resamples(list(LG=fit.glm, LDA=fit.lda, GLMNET=fit.glmnet, KNN=fit.knn, CART=fit.cart, NB=fit.nb, SVM=fit.svm))
summary(transform_results)
dotplot(transform_results)



# Tune SVM

# 10-fold cross validation with 3 repeats
control <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"
set.seed(7)
grid <- expand.grid(.sigma=c(0.025, 0.05, 0.1, 0.15), .C=seq(1, 10, by=1))
fit.svm <- train(Class~., data=dataset, method="svmRadial", metric=metric, tuneGrid=grid, preProc=c("BoxCox"), trControl=control, na.action=na.omit)
print(fit.svm)
plot(fit.svm)


# Tune kNN

# 10-fold cross validation with 3 repeats
control <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"
set.seed(7)
grid <- expand.grid(.k=seq(1,20,by=1))
fit.knn <- train(Class~., data=dataset, method="knn", metric=metric, tuneGrid=grid, preProc=c("BoxCox"), trControl=control, na.action=na.omit)
print(fit.knn)
plot(fit.knn)



# Ensembles: Boosting and Bagging


# 10-fold cross validation with 3 repeats
control <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"
# Bagged CART
set.seed(7)
fit.treebag <- train(Class~., data=dataset, method="treebag", metric=metric, trControl=control, na.action=na.omit)
# Random Forest
set.seed(7)
fit.rf <- train(Class~., data=dataset, method="rf", metric=metric, preProc=c("BoxCox"), trControl=control, na.action=na.omit)
# Stochastic Gradient Boosting
set.seed(7)
fit.gbm <- train(Class~., data=dataset, method="gbm", metric=metric, preProc=c("BoxCox"), trControl=control, verbose=FALSE, na.action=na.omit)
# C5.0
set.seed(7)
fit.c50 <- train(Class~., data=dataset, method="C5.0", metric=metric, preProc=c("BoxCox"), trControl=control, na.action=na.omit)
# Compare results
ensemble_results <- resamples(list(BAG=fit.treebag, RF=fit.rf, GBM=fit.gbm, C50=fit.c50))
summary(ensemble_results)
dotplot(ensemble_results)
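

# Make predictions on the held-out validation split (a minimal sketch, not
# part of the original notebook; it assumes the tuned fit.svm above is the
# chosen model). The validation data needs the same preparation as 'dataset'.
validation <- validation[,-1]   # drop the redundant Id column
for(i in 1:9) {
    validation[,i] <- as.numeric(as.character(validation[,i]))
}
# drop rows with missing values so predict() receives complete cases
validation <- validation[complete.cases(validation),]
predictions <- predict(fit.svm, newdata=validation)
confusionMatrix(predictions, validation$Class)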
[1] 560  11

        Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
2  1002945            5         4          4             5            7          10           3               2       1    benign
4  1016277            6         8          8             1            3           4           3               7       1    benign
5  1017023            4         1          1             3            2           1           3               1       1    benign
6  1017122            8        10         10             8            7          10           9               7       1 malignant
7  1018099            1         1          1             1            2          10           3               1       1    benign
9  1033078            2         1          1             1            2           1           1               1       5    benign
10 1033078            4         2          1             1            2           1           2               1       1    benign
11 1035283            1         1          1             1            1           1           3               1       1    benign
12 1036172            2         1          1             1            2           1           2               1       1    benign
14 1043999            1         1          1             1            2           3           3               1       1    benign
15 1044572            8         7          5            10            7           9           5               5       4 malignant
16 1047630            7         4          6             4            6           1           4               3       1 malignant
17 1048672            4         1          1             1            2           1           2               1       1    benign
18 1049815            4         1          1             1            2           1           3               1       1    benign
19 1050670           10         7          7             6            4          10           4               1       2 malignant
20 1050718            6         1          1             1            2           1           3               1       1    benign
21 1054590            7         3          2            10            5          10           5               4       4 malignant
22 1054593           10         5          5             3            6           7           7              10       1 malignant
23 1056784            3         1          1             1            2           1           2               1       1    benign
24 1057013            8         4          5             1            2          NA           7               3       1 malignant

$Id
[1] "character"

$Cl.thickness
[1] "ordered" "factor"

$Cell.size
[1] "ordered" "factor"

$Cell.shape
[1] "ordered" "factor"

$Marg.adhesion
[1] "ordered" "factor"

$Epith.c.size
[1] "ordered" "factor"

$Bare.nuclei
[1] "factor"

$Bl.cromatin
[1] "factor"

$Normal.nucleoli
[1] "factor"

$Mitoses
[1] "factor"

$Class
[1] "factor"

  Cl.thickness      Cell.size        Cell.shape     Marg.adhesion   
 Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
 1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.000  
 Median : 4.000   Median : 1.000   Median : 2.000   Median : 1.000  
 Mean   : 4.384   Mean   : 3.116   Mean   : 3.198   Mean   : 2.875  
 3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
 Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
                                                                    
  Epith.c.size     Bare.nuclei      Bl.cromatin     Normal.nucleoli 
 Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
 1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000  
 Median : 2.000   Median : 1.000   Median : 3.000   Median : 1.000  
 Mean   : 3.232   Mean   : 3.468   Mean   : 3.405   Mean   : 2.877  
 3rd Qu.: 4.000   3rd Qu.: 5.000   3rd Qu.: 4.250   3rd Qu.: 4.000  
 Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
                  NA's   :13                                        
    Mitoses             Class    
 Min.   : 1.000   benign   :367  
 1st Qu.: 1.000   malignant:193  
 Median : 1.000                  
 Mean   : 1.611                  
 3rd Qu.: 1.000                  
 Max.   :10.000                  
                                 
           freq percentage
benign      367   65.53571
malignant   193   34.46429

                Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli   Mitoses
Cl.thickness       1.0000000 0.6200884  0.6302917     0.4741733    0.5089557   0.5600770   0.5290733       0.5143933 0.3426018
Cell.size          0.6200884 1.0000000  0.9011340     0.7141150    0.7404824   0.6687226   0.7502700       0.7072182 0.4506532
Cell.shape         0.6302917 0.9011340  1.0000000     0.6846206    0.7043423   0.6896724   0.7276114       0.7127155 0.4345125
Marg.adhesion      0.4741733 0.7141150  0.6846206     1.0000000    0.5860660   0.6660165   0.6660533       0.6031036 0.4314910
Epith.c.size       0.5089557 0.7404824  0.7043423     0.5860660    1.0000000   0.5568406   0.6102032       0.6433364 0.4775271
Bare.nuclei        0.5600770 0.6687226  0.6896724     0.6660165    0.5568406   1.0000000   0.6668483       0.5795794 0.3539473
Bl.cromatin        0.5290733 0.7502700  0.7276114     0.6660533    0.6102032   0.6668483   1.0000000       0.6838547 0.3545122
Normal.nucleoli    0.5143933 0.7072182  0.7127155     0.6031036    0.6433364   0.5795794   0.6838547       1.0000000 0.4084127
Mitoses            0.3426018 0.4506532  0.4345125     0.4314910    0.4775271   0.3539473   0.3545122       0.4084127 1.0000000