Applied Data Science Notebook in Python for Beginners to Professionals¶

An end-to-end tutorial on Cluster Analysis - Applied Machine Learning & Data Science¶

Clustering Analysis & Visualisation in Python¶

# Silence all warnings so that library notices do not clutter the notebook output
import warnings

warnings.filterwarnings(action="ignore")

# Load the library
import numpy as np
import pandas as pd
from pandas import plotting
import matplotlib.pyplot as plt
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as ff
import seaborn as sns
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below
plt.style.use('fivethirtyeight')

# Read the mall-customer records from CSV and preview the first rows
csv_path = 'Mall_Customers.csv'
df = pd.read_csv(csv_path)
df.head()

# Give the two verbose columns shorter names that are easier to reference
short_names = {
    'Annual Income (k$)': 'Income',
    'Spending Score (1-100)': 'Spending_score',
}
df.rename(columns=short_names, inplace=True)

# Show the first rows as a Plotly table
dat = ff.create_table(df.head())
py.iplot(dat)

# Summary statistics of the numeric columns
df.describe()

# Render the same statistics as a Plotly table.
# index=True keeps the row labels (count, mean, std, ...): create_table
# leaves the index out by default, so the rows would be unidentifiable.
desc = ff.create_table(df.describe(), index=True)
py.iplot(desc)

# Report the frame's dimensions, the missing-value count per column and the column dtypes
n_rows, n_cols = df.shape
print()
print("Database has {} observations (instances) and {} columns (attributes).".format(n_rows, n_cols))

missing_per_column = df.isnull().sum()
print()
print("Missing values in each column:\n{}".format(missing_per_column))

column_types = df.dtypes
print()
print("Columns data types:\n{}".format(column_types))

## OR

# One boolean answer: is there a null anywhere in the frame?
res = df.isnull().any().any()
print()
print("Is there any NULL data in the dataframe: ", res)

Database has 200 observations (instances) and 5 columns (attributes).

Missing values in each column:
CustomerID        0
Gender            0
Age               0
Income            0
Spending_score    0
dtype: int64

Columns data types:
CustomerID         int64
Gender            object
Age                int64
Income             int64
Spending_score     int64
dtype: object

Is there any NULL data in the dataframe:  False

Data Visualisation¶

# Andrews curves of the customer features, grouped by Gender

plt.rcParams['figure.figsize'] = (12, 8)

# Leave the identifier column out of the curves
features = df.drop(columns="CustomerID")
plotting.andrews_curves(features, "Gender")
plt.title('Andrew Curves for Gender', fontsize=10)
plt.grid()
plt.show()

The Andrews curves are able to preserve means, distance (up to a constant) and variances.

# Distributions of the three numeric features, side by side in ONE figure.
#
# Fixes relative to the earlier version of this cell:
#   * a plt.show() after the second panel flushed the figure, so the third
#     panel was drawn alone in a new figure instead of completing the 1x3 grid;
#   * the seaborn style was set after the first panel had been created, so
#     the panels were not styled consistently;
#   * the y axis was labelled 'Count', but distplot draws a normalised
#     histogram with a KDE overlay, i.e. a density.
# NOTE: seaborn >= 0.11 deprecates distplot in favour of histplot/displot.
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (18, 4)

# (column, colour, title, x-axis label); None keeps seaborn's default colour.
panels = [
    ('Income', None, 'Distribution of Annual Income', 'Range of Annual Income'),
    ('Age', 'red', 'Distribution of Age', 'Range of Age'),
    ('Spending_score', 'green', 'Distribution of Spending Score', 'Range of Spending Score'),
]

_, axes = plt.subplots(1, 3)
for ax, (column, colour, title, xlabel) in zip(axes, panels):
    sns.distplot(df[column], color=colour, ax=ax)
    ax.set_title(title, fontsize=10)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Density')

# Show once, after all three panels have been drawn
plt.show()

# Check the distribution of Gender

size = df['Gender'].value_counts()
# Take the labels from the counts themselves so that each wedge is always
# named after the category it measures, whatever order value_counts()
# returns the categories in.
labels = size.index.tolist()
colors = ['lightgreen', 'orange']
explode = [0, 0.1]  # pull the second wedge slightly out of the pie

plt.rcParams['figure.figsize'] = (6, 6)
plt.pie(size, colors=colors, explode=explode, labels=labels, shadow=True, autopct='%.2f%%')
plt.title('Gender', fontsize=20)
plt.axis('off')
plt.legend()
plt.show()

The pie chart above shows the distribution of Gender among the mall's customers.

Interestingly, females are in the lead with a share of 56%, whereas males have a share of 44%. That is a large gap, especially considering that the population of males is comparatively higher than that of females.

# Generate count plots: one bar per distinct value of each numeric feature.
#
# The column is passed as x=...; seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer interpreted as the
# x variable. The keyword form works on older releases as well.
count_plot_specs = (
    # (column, figure size, title, title font size)
    ('Age', (15, 6), 'Distribution of Age', 15),
    ('Income', (20, 8), 'Distribution of Annual Income', 10),
    ('Spending_score', (20, 8), 'Distribution of Spending Score', 10),
)

for column, figsize, title, title_size in count_plot_specs:
    plt.rcParams['figure.figsize'] = figsize
    sns.countplot(x=df[column], palette='hsv')
    plt.title(title, fontsize=title_size)
    plt.show()

# Pairwise relationships between the three numeric features
numeric_cols = ['Age', 'Income', 'Spending_score']
g = sns.pairplot(data=df[numeric_cols])
# Optional figure-level title: g.fig.suptitle("Pairplot for the Data")
plt.show()

# Annotated heatmap of the correlations between the numeric features
data = df[['Age', 'Income', 'Spending_score']]
corr_matrix = data.corr()

plt.rcParams['figure.figsize'] = (12, 6)
sns.heatmap(data=corr_matrix, annot=True, cmap='Wistia')
plt.title('Heatmap for the Data', fontsize=20)
plt.show()

#  Gender vs Spending score plot

plt.rcParams['figure.figsize'] = (18, 7)
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments, while the keyword form works on older releases too.
sns.boxenplot(x=df['Gender'], y=df['Spending_score'], palette='Blues')
plt.title('Gender vs Spending Score', fontsize=20)
plt.show()

#  Gender vs Income plot

plt.rcParams['figure.figsize'] = (18, 7)
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments, while the keyword form works on older releases too.
sns.boxenplot(x=df['Gender'], y=df['Income'], palette='Greens')
plt.title('Gender vs Income', fontsize=20)
plt.show()

# Income and Spending score as functions of Age, drawn on shared axes
y = df['Income']
x = df['Age']
z = df['Spending_score']

# x and y are passed by keyword (seaborn >= 0.12 rejects them positionally).
# Each line gets a label so the legend tells the two series apart.
sns.lineplot(x=x, y=y, color='blue', label='Income')
sns.lineplot(x=x, y=z, color='pink', label='Spending score')
# Without this, the axis keeps only the name of the last series plotted
plt.ylabel('Income / Spending score')
plt.title('Age vs Income and Age vs Spending Score', fontsize=20)
plt.legend()
plt.show()

Cluster Analysis¶

# Feature matrix for clustering: annual income and spending score.
# The columns are selected by name rather than by position so that the
# right features are used even if the column order of the frame changes.
x = df[['Income', 'Spending_score']].values

# let's check the shape of x
print(x.shape)

(200, 2)

Using Dendrograms to Find the Optimal Number of Clusters¶

import scipy.cluster.hierarchy as sch

# Build the hierarchical clustering with Ward linkage and draw its dendrogram.
# The result is assigned only so that the returned dict is not echoed.
dendrogram = sch.dendrogram(sch.linkage(x, method='ward'))
plt.title('Dendrogram', fontsize=20)  # spelling fixed (was 'Dendrogam')
plt.xlabel('Customers')
plt.ylabel('Euclidean Distance')  # spelling fixed (was 'Ecuclidean')
plt.show()

	CustomerID	Age	Income	Spending_score
count	200.000000	200.000000	200.000000	200.000000
mean	100.500000	38.850000	60.560000	50.200000
std	57.879185	13.969007	26.264721	25.823522
min	1.000000	18.000000	15.000000	1.000000
25%	50.750000	28.750000	41.500000	34.750000
50%	100.500000	36.000000	61.500000	50.000000
75%	150.250000	49.000000	78.000000	73.000000
max	200.000000	70.000000	137.000000	99.000000

	CustomerID	Gender	Age	Annual Income (k$)	Spending Score (1-100)
0	1	Male	19	15	39
1	2	Male	21	15	81
2	3	Female	20	16	6
3	4	Female	23	16	77
4	5	Female	31	17	40