Applied Data Science Notebook in Python for Beginners to Professionals¶

An end-to-end tutorial on Cluster Analysis - Applied Machine Learning & Data Science¶

Unsupervised Learning - Comparing Clustering Algorithms in Python¶

# Suppress warnings in Jupyter Notebooks

import warnings
warnings.filterwarnings("ignore")

# Load the library
import numpy as np
import pandas as pd
from pandas import plotting
import matplotlib.pyplot as plt
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as ff
import seaborn as sns
plt.style.use('fivethirtyeight')

# Read the mall-customer data from disk and peek at the first rows
df = pd.read_csv('Mall_Customers.csv')
df.head()

# Shorten the two verbose column headers so they are easier to reference later
short_names = {
    'Annual Income (k$)': 'Income',
    'Spending Score (1-100)': 'Spending_score',
}
df.rename(columns=short_names, inplace=True)

# Render the first rows (with the new column names) as a Plotly table
dat = ff.create_table(df.head())
py.iplot(dat)

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

# The same summary rendered as a Plotly table.
# create_table omits the DataFrame index unless index=True; without it the
# rows would lose their count/mean/std/... labels and be unreadable.
desc = ff.create_table(df.describe(), index=True)
py.iplot(desc)

# Report the dataset size, the per-column count of missing values and the column dtypes
n_rows, n_cols = df.shape
print()
print(f"Database has {n_rows} observations (instances) and {n_cols} columns (attributes).")
print()
print(f"Missing values in each column:\n{df.isnull().sum()}")
print()
print(f"Columns data types:\n{df.dtypes}")

## OR

# Collapse the null mask twice (per column, then overall) into a single flag
res = df.isnull().any().any()
print()
print("Is there any NULL data in the dataframe: ", res)

Database has 200 observations (instances) and 5 columns (attributes).

Missing values in each column:
CustomerID        0
Gender            0
Age               0
Income            0
Spending_score    0
dtype: int64

Columns data types:
CustomerID         int64
Gender            object
Age                int64
Income             int64
Spending_score     int64
dtype: object

Is there any NULL data in the dataframe:  False

Data Visualisation¶

# Draw the distributions of Income, Age and Spending_score side by side in
# one 1x3 figure.
plt.rcParams['figure.figsize'] = (18, 4)

# (column, line colour, panel title, x-axis label); None keeps seaborn's default colour
panels = [
    ('Income', None, 'Distribution of Annual Income', 'Range of Annual Income'),
    ('Age', 'red', 'Distribution of Age', 'Range of Age'),
    ('Spending_score', 'green', 'Distribution of Spending Score', 'Range of Spending Score'),
]

for position, (column, color, title, xlabel) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    sns.set(style = 'whitegrid')
    sns.distplot(df[column], color = color)
    plt.title(title, fontsize = 10)
    plt.xlabel(xlabel)
    plt.ylabel('Count')

# Show exactly once, after all three panels exist. Calling plt.show() between
# panels finishes the current figure, so the remaining panel would be drawn
# alone in a new, mostly empty figure.
plt.show()

# Check the distribution of Gender

size = df['Gender'].value_counts()
# Take the wedge labels from the counts themselves. value_counts() orders by
# frequency, so a hard-coded label list would silently mislabel the wedges
# whenever the category order differs from the assumed one.
labels = size.index.tolist()
# NOTE: colors/explode are sized for two categories, which is what this
# dataset's Gender column contains.
colors = ['lightgreen', 'orange']
explode = [0, 0.1]

plt.rcParams['figure.figsize'] = (6, 6)
plt.pie(size, colors = colors, explode = explode, labels = labels, shadow = True, autopct = '%.2f%%')
plt.title('Gender', fontsize = 20)
plt.axis('off')
plt.legend()
plt.show()

The pie chart above shows the distribution of Gender among the mall's customers.

Interestingly, females are in the lead with a share of 56%, whereas males have a share of 44%. That is a sizeable gap, especially when the population of males is comparatively higher than that of females.

Generate Histograms¶

# One bar per distinct value, for Age, Income and Spending_score in turn.
# The column is passed as x= explicitly: recent seaborn releases no longer
# accept the data vector as a bare positional argument, while the keyword
# form works on old and new versions alike.
plt.rcParams['figure.figsize'] = (15, 6)
sns.countplot(x = df['Age'], palette = 'hsv')
plt.title('Distribution of Age', fontsize = 15)
plt.show()

plt.rcParams['figure.figsize'] = (20, 8)
sns.countplot(x = df['Income'], palette = 'hsv')
plt.title('Distribution of Annual Income', fontsize = 10)
plt.show()

plt.rcParams['figure.figsize'] = (20, 8)
sns.countplot(x = df['Spending_score'], palette = 'hsv')
plt.title('Distribution of Spending Score', fontsize = 10)
plt.show()

# The three numeric features examined in both plots below
numeric_cols = ['Age', 'Income', 'Spending_score']

# Scatter matrix: every numeric feature plotted against every other
g = sns.pairplot(df[numeric_cols])
#g.fig.suptitle("Pairplot for the Data")
plt.show()

# Pairwise correlations of the same features, drawn as an annotated heatmap
data = df[numeric_cols]
plt.rcParams['figure.figsize'] = (12, 6)
correlations = data.corr()
sns.heatmap(correlations, annot = True, cmap = 'Wistia')
plt.title('Heatmap for the Data', fontsize = 20)
plt.show()

# All x/y vectors below are passed by keyword: recent seaborn releases reject
# positional x/y, while the keyword form works on old and new versions alike.

#  Gender vs Spendscore plot

plt.rcParams['figure.figsize'] = (18, 7)
sns.boxenplot(x = df['Gender'], y = df['Spending_score'], palette = 'Blues')
plt.title('Gender vs Spending Score', fontsize = 20)
plt.show()

#  Gender vs Income plot

plt.rcParams['figure.figsize'] = (18, 7)
sns.boxenplot(x = df['Gender'], y = df['Income'], palette = 'Greens')
plt.title('Gender vs Income', fontsize = 20)
plt.show()

# Age vs Income (blue) and Age vs Spending_score (pink) on shared axes
y = df['Income']
x = df['Age']
z = df['Spending_score']

sns.lineplot(x = x, y = y, color = 'blue')
sns.lineplot(x = x, y = z, color = 'pink')
plt.title('Age vs Income and Age vs Spending Score', fontsize = 20)
plt.show()

Cluster Analysis¶

KMeans Algorithm¶

# Start the clustering section from a fresh read of the raw file
df = pd.read_csv('Mall_Customers.csv')
df.rename(
    columns={
        'Annual Income (k$)': 'Income',
        'Spending Score (1-100)': 'Spending_score',
    },
    inplace=True,
)

# Feature matrix for clustering: everything except the identifier and Gender
X = df.drop(columns=['CustomerID', 'Gender'])

# Pairwise view of the features, coloured by Gender
sns.pairplot(df.drop(columns='CustomerID'), hue='Gender', aspect=1.5)
plt.show()

from sklearn.cluster import KMeans

# Fit on the raw features only. Later cells store cluster assignments in
# X['Labels']; dropping that column (if present) keeps this cell correct even
# when it is re-run after them.
features = X.drop(columns='Labels', errors='ignore')

# Inertia (within-cluster sum of squares) for k = 1..10.
# A fixed random_state makes the curve reproducible between runs.
k_values = list(range(1, 11))
clusters = [
    KMeans(n_clusters=k, random_state=42).fit(features).inertia_
    for k in k_values
]

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=k_values, y=clusters, ax=ax)
ax.set_title('Searching for Elbow')
ax.set_xlabel('Clusters')
ax.set_ylabel('Inertia')

# Annotate arrows. The arrow tips are anchored to the computed inertia for
# k=3 and k=5 (clusters[k - 1]) so they always touch the curve instead of
# pointing at hard-coded y-values.
ax.annotate('Possible Elbow Point', xy=(3, clusters[2]), xytext=(3, 50000), xycoords='data',
             arrowprops=dict(arrowstyle='->', connectionstyle='arc3', color='blue', lw=2))

ax.annotate('Possible Elbow Point', xy=(5, clusters[4]), xytext=(5, 150000), xycoords='data',
             arrowprops=dict(arrowstyle='->', connectionstyle='arc3', color='blue', lw=2))

plt.show()

# Cluster on the raw features only. X['Labels'] is (re)assigned below purely
# for plotting; if it were left in the matrix passed to fit(), the previous
# run's cluster ids would be treated as an input feature.
features = X.drop(columns='Labels', errors='ignore')

# Visualise clusters using 3
km3 = KMeans(n_clusters=3, random_state=42).fit(features)

X['Labels'] = km3.labels_
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Income', y='Spending_score', hue='Labels', data=X,
                palette=sns.color_palette('hls', 3))
plt.title('KMeans with 3 Clusters')
plt.show()

# Visualise clusters using 5
km5 = KMeans(n_clusters=5, random_state=42).fit(features)

X['Labels'] = km5.labels_
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Income', y='Spending_score', hue='Labels', data=X,
                palette=sns.color_palette('hls', 5))
plt.title('KMeans with 5 Clusters')
plt.show()

# Swarm plots of the current cluster labels against Income and Spending_score,
# drawn side by side in one figure

fig = plt.figure(figsize=(20,8))

# (subplot position, y-axis column, panel title)
swarm_panels = (
    (121, 'Income', 'Labels According to Annual Income'),
    (122, 'Spending_score', 'Labels According to Scoring History'),
)

for position, column, title in swarm_panels:
    ax = fig.add_subplot(position)
    sns.swarmplot(x='Labels', y=column, data=X, ax=ax)
    ax.set_title(title)

plt.show()

Agglomerative Hierarchical Clustering Algorithm¶

from sklearn.cluster import AgglomerativeClustering

# Cluster on the raw features only: X['Labels'] still holds the assignments
# of whichever algorithm ran last and must not be fed in as a feature.
features = X.drop(columns='Labels', errors='ignore')

# using 3 clusters
agglom = AgglomerativeClustering(n_clusters=3, linkage='average').fit(features)

X['Labels'] = agglom.labels_
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Income', y='Spending_score', hue='Labels', data=X,
                palette=sns.color_palette('hls', 3))
plt.title('Agglomerative with 3 Clusters')
plt.show()

# using 5 clusters
agglom = AgglomerativeClustering(n_clusters=5, linkage='average').fit(features)

X['Labels'] = agglom.labels_
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Income', y='Spending_score', hue='Labels', data=X,
                palette=sns.color_palette('hls', 5))
plt.title('Agglomerative with 5 Clusters')
plt.show()

Density Based Clustering (DBSCAN) Algorithm¶

from sklearn.cluster import DBSCAN

# Cluster on the raw features only: X['Labels'] still holds the assignments
# of whichever algorithm ran last and must not be fed in as a feature.
features = X.drop(columns='Labels', errors='ignore')

db = DBSCAN(eps=11, min_samples=6).fit(features)

X['Labels'] = db.labels_
plt.figure(figsize=(12, 8))
# One palette colour per distinct label found by DBSCAN
sns.scatterplot(x='Income', y='Spending_score', hue='Labels', data=X,
                palette=sns.color_palette('hls', np.unique(db.labels_).shape[0]))
plt.title('DBSCAN with epsilon 11, min samples 6')
plt.show()

Clusters using Mean Shift Algorithm¶

from sklearn.cluster import MeanShift, estimate_bandwidth

# Cluster on the raw features only: X['Labels'] still holds the assignments
# of whichever algorithm ran last and must not influence either the bandwidth
# estimate or the fit.
features = X.drop(columns='Labels', errors='ignore')

# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(features, quantile=0.1)
# bandwidth must be passed by keyword: it is a keyword-only parameter in
# current scikit-learn releases.
ms = MeanShift(bandwidth=bandwidth).fit(features)

X['Labels'] = ms.labels_
plt.figure(figsize=(12, 8))
# One palette colour per cluster found by MeanShift
sns.scatterplot(x='Income', y='Spending_score', hue='Labels', data=X,
                palette=sns.color_palette('hls', np.unique(ms.labels_).shape[0]))
plt.title('MeanShift')
plt.show()

Compare all algorithms at one place¶

# Fit all four algorithms on the SAME raw feature matrix so the comparison is
# fair. X['Labels'] is only a plotting column; leaving it in the matrix would
# feed each algorithm the previous algorithm's cluster ids as a feature.
features = X.drop(columns='Labels', errors='ignore')

km5 = KMeans(n_clusters=5).fit(features)
agglom = AgglomerativeClustering(n_clusters=5, linkage='average').fit(features)
db = DBSCAN(eps=11, min_samples=6).fit(features)
bandwidth = estimate_bandwidth(features, quantile=0.1)
# bandwidth is keyword-only in current scikit-learn releases
ms = MeanShift(bandwidth=bandwidth).fit(features)

# (panel title, cluster assignments) in left-to-right, top-to-bottom order
comparison = [
    ('KMeans with 5 Clusters', km5.labels_),
    ('Agglomerative with 5 Clusters', agglom.labels_),
    ('DBSCAN with epsilon 11, min samples 6', db.labels_),
    ('MeanShift', ms.labels_),
]

fig = plt.figure(figsize=(20,15))

for position, (title, cluster_labels) in enumerate(comparison, start=1):
    ax = fig.add_subplot(2, 2, position)
    X['Labels'] = cluster_labels
    # One palette colour per distinct label produced by this algorithm
    n_labels = np.unique(cluster_labels).shape[0]
    sns.scatterplot(x='Income', y='Spending_score', hue='Labels', style='Labels', data=X,
                    palette=sns.color_palette('hls', n_labels), s=60, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

	CustomerID	Age	Income	Spending_score
count	200.000000	200.000000	200.000000	200.000000
mean	100.500000	38.850000	60.560000	50.200000
std	57.879185	13.969007	26.264721	25.823522
min	1.000000	18.000000	15.000000	1.000000
25%	50.750000	28.750000	41.500000	34.750000
50%	100.500000	36.000000	61.500000	50.000000
75%	150.250000	49.000000	78.000000	73.000000
max	200.000000	70.000000	137.000000	99.000000

	CustomerID	Gender	Age	Annual Income (k$)	Spending Score (1-100)
0	1	Male	19	15	39
1	2	Male	21	15	81
2	3	Female	20	16	6
3	4	Female	23	16	77
4	5	Female	31	17	40