How to compute CLARA (Clustering Large Applications) in R

# -----------------------------------------------------------
# How to compute CLARA (Clustering Large Applications) in R
# -----------------------------------------------------------
# load library
library(cluster)
library(factoextra)

# Generate simulated data ----
# Fix the RNG seed so the simulated points are identical from run to run.
set.seed(1234)

# 900 objects in 3 Gaussian clusters (sd = 8 on both axes):
# 200 points centred on (0, 0), 300 on (50, 50) and 400 on (100, 100).
df <- rbind(
  cbind(rnorm(200, mean = 0, sd = 8), rnorm(200, mean = 0, sd = 8)),
  cbind(rnorm(300, mean = 50, sd = 8), rnorm(300, mean = 50, sd = 8)),
  cbind(rnorm(400, mean = 100, sd = 8), rnorm(400, mean = 100, sd = 8))
)

# Specify column and row names; seq_len() stays correct even for 0 rows.
colnames(df) <- c("x", "y")
rownames(df) <- paste0("S", seq_len(nrow(df)))

# Preview the first 6 rows of the data.
# The row-count argument of head() is `n`; an unknown name such as `nrow`
# is silently absorbed by `...` and has no effect.
head(df, n = 6)

# Estimate the optimal number of clusters for CLARA using the
# silhouette method, drawn with the classic ggplot2 theme.
fviz_nbclust(x = df, FUNcluster = clara, method = "silhouette") +
  theme_classic()

# Compute CLARA with k = 3 clusters over 50 sub-samples of the data.
# pamLike = TRUE asks clara() to use the same swap algorithm as pam().
clara.res <- clara(df, k = 3, samples = 50, pamLike = TRUE)

# Print a summary of the fit (medoids, objective, cluster sizes, components)
print(clara.res)

# Add the clustering result to the data.
# The component is named `clustering`; spell it out in full instead of
# relying on `$` partial matching (`clara.res$cluster`).
dd <- cbind(df, cluster = clara.res$clustering)
head(dd, n = 4)

# Plot the CLARA partition: one colour per cluster, observations drawn
# as small points, each cluster outlined by a "t" concentration ellipse.
fviz_cluster(
  object = clara.res,
  geom = "point",
  pointsize = 1,
  ellipse.type = "t",
  palette = c("#00AFBB", "#FC4E07", "#E7B800"),
  ggtheme = theme_classic()
)

Call:	 clara(x = df, k = 3, samples = 50, pamLike = TRUE) 
Medoids:
              x            y
S7     1.464661  -0.04113031
S478  49.509561  49.48060014
S755 100.550037 100.17425530
Objective function:	 9.991144
Clustering vector: 	 Named int [1:900] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...
 - attr(*, "names")= chr [1:900] "S1" "S2" "S3" "S4" "S5" "S6" "S7" ...
Cluster sizes:	    	 200 300 400 
Best sample:
 [1] S7   S26  S47  S73  S87  S133 S167 S191 S196 S197 S211 S229 S231 S259 S264
[16] S281 S287 S290 S317 S344 S381 S390 S410 S433 S440 S441 S445 S474 S478 S479
[31] S524 S530 S537 S585 S606 S712 S736 S738 S741 S755 S841 S854 S878 S879 S887
[46] S896

Available components:
 [1] "sample"     "medoids"    "i.med"      "clustering" "objective" 
 [6] "clusinfo"   "diss"       "call"       "silinfo"    "data"

	x	y
S1	6.4044343	-8.748718
S2	9.5216530	1.539533
S3	-13.5164453	-1.009283
S4	9.9159671	-11.107457
S5	-0.8717278	3.759218
S6	-0.9379357	7.683059

	x	y	cluster
S1	6.404434	-8.748718	1
S2	9.521653	1.539533	1
S3	-13.516445	-1.009283	1
S4	9.915967	-11.107457	1