Applied Data Science Notebook in Python for Beginners to Professionals

An end-to-end tutorial on Cluster Analysis - Applied Machine Learning & Data Science

Clustering Analysis & Visualisation in Python

In [1]:
# Silence all warnings for the rest of the session so that library
# notices do not clutter the cell outputs.

import warnings
warnings.filterwarnings(action="ignore")
In [2]:
# Load the libraries used throughout the notebook
# -- numerical / tabular data
import numpy as np
import pandas as pd
# pandas.plotting supplies andrews_curves, used in the visualisation section
from pandas import plotting
# -- static plotting
import matplotlib.pyplot as plt
# -- interactive Plotly tables (py.iplot renders them, ff.create_table builds them)
import plotly.offline as py
# NOTE(review): init_notebook_mode, iplot, go and tools are not referenced in the
# cells visible here -- confirm whether later cells rely on them
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as ff
# -- statistical plotting
import seaborn as sns
# Global matplotlib style; later sns.set(...) calls change the styling again
plt.style.use('fivethirtyeight')
In [3]:
# Read the mall-customer records from the CSV file (path is relative to the
# notebook's working directory)
df = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')

# Preview the first five rows
df.head(n=5)
Out[3]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
In [4]:
# Shorten the two verbose column headers so they are easier to reference later
df.rename(
    mapper={
        'Annual Income (k$)': 'Income',
        'Spending Score (1-100)': 'Spending_score',
    },
    axis='columns',
    inplace=True,
)
In [5]:
# Render the first five rows as a Plotly table
dat = ff.create_table(df.iloc[:5])
py.iplot(dat)
In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])
Out[6]:
CustomerID Age Income Spending_score
count 200.000000 200.000000 200.000000 200.000000
mean 100.500000 38.850000 60.560000 50.200000
std 57.879185 13.969007 26.264721 25.823522
min 1.000000 18.000000 15.000000 1.000000
25% 50.750000 28.750000 41.500000 34.750000
50% 100.500000 36.000000 61.500000 50.000000
75% 150.250000 49.000000 78.000000 73.000000
max 200.000000 70.000000 137.000000 99.000000
In [7]:
# Summary statistics rendered as a Plotly table.
# index=True keeps the statistic names (count, mean, std, ...) as the first
# column; by default create_table drops the DataFrame index, which would
# leave rows of unlabelled numbers.
desc = ff.create_table(df.describe(), index=True, index_title='statistic')
py.iplot(desc)
In [8]:
# Report the size of the dataset, the number of missing values per column
# and the dtype of each column.
n_rows, n_cols = df.shape
missing_per_column = df.isna().sum()

print()
print("Database has {} observations (instances) and {} columns (attributes).".format(n_rows, n_cols))
print()
print("Missing values in each column:\n{}".format(missing_per_column))
print()
print("Columns data types:\n{}".format(df.dtypes))

## OR

# collapse the null check into a single flag for the whole frame
res = df.isna().any().any()
print()
print("Is there any NULL data in the dataframe: ", res)
Database has 200 observations (instances) and 5 columns (attributes).

Missing values in each column:
CustomerID        0
Gender            0
Age               0
Income            0
Spending_score    0
dtype: int64

Columns data types:
CustomerID         int64
Gender            object
Age                int64
Income             int64
Spending_score     int64
dtype: object

Is there any NULL data in the dataframe:  False

Data Visualisation

In [9]:
# Andrews curves: one curve per customer, grouped by Gender.
# CustomerID is only an identifier, so it is left out of the curves.

plt.rcParams.update({'figure.figsize': (12, 8)})

features_by_gender = df.drop(columns="CustomerID")
plotting.andrews_curves(features_by_gender, class_column="Gender")
plt.title('Andrew Curves for Gender', fontsize = 10)
plt.grid()
plt.show()

The Andrews curves are able to preserve means, distance (up to a constant) and variances.

In [10]:
# Distributions of the three numeric features, side by side in ONE figure.
plt.rcParams['figure.figsize'] = (18, 4)

# Apply the seaborn style once, before any axes exist, so that all three
# panels share the same look (previously the first panel was created before
# the style was set).
sns.set(style = 'whitegrid')

# (column, panel title, x-axis label, colour); None keeps seaborn's default colour
panels = [
    ('Income', 'Distribution of Annual Income', 'Range of Annual Income', None),
    ('Age', 'Distribution of Age', 'Range of Age', 'red'),
    ('Spending_score', 'Distribution of Spending Score', 'Range of Spending Score', 'green'),
]

for position, (column, title, xlabel, color) in enumerate(panels, start = 1):
    plt.subplot(1, 3, position)
    sns.distplot(df[column], color = color)
    plt.title(title, fontsize = 10)
    plt.xlabel(xlabel)
    # distplot draws a KDE by default, which normalises the histogram,
    # so the y axis is a density rather than a raw count
    plt.ylabel('Density')

# A single show() after all panels are drawn. Calling plt.show() after the
# second panel (as before) closed the figure and pushed the third panel into
# a separate, mostly empty figure.
plt.tight_layout()
plt.show()
In [11]:
# Check the distribution of Gender

size = df['Gender'].value_counts()
# Take the labels from the counts themselves: value_counts() orders categories
# by frequency, so a hard-coded ['Female', 'Male'] list would mislabel the
# wedges whenever 'Male' happens to be the more frequent value.
labels = size.index.tolist()
# NOTE: colors and explode assume exactly two gender categories in the data
colors = ['lightgreen', 'orange']
explode = [0, 0.1]

plt.rcParams['figure.figsize'] = (6, 6)
plt.pie(size, colors = colors, explode = explode, labels = labels, shadow = True, autopct = '%.2f%%')
plt.title('Gender', fontsize = 20)
plt.axis('off')
plt.legend()
plt.show()

The pie chart above shows the distribution of Gender among the mall's customers.

Interestingly, females are in the lead with a share of 56%, whereas males have a share of 44%. That is a sizeable gap, especially given that the population of males is comparatively higher than that of females.

In [12]:
# Generate Histograms
In [13]:
# Number of customers at each age
plt.rcParams['figure.figsize'] = (15, 6)
# Name the column via x=/data= keywords: recent seaborn releases treat the
# first positional argument as `data`, not `x`, so passing the Series
# positionally no longer produces the intended per-value counts.
sns.countplot(x = 'Age', data = df, palette = 'hsv')
plt.title('Distribution of Age', fontsize = 15)
plt.show()
In [14]:
# Number of customers at each annual-income value
plt.rcParams['figure.figsize'] = (20, 8)
# Name the column via x=/data= keywords: recent seaborn releases treat the
# first positional argument as `data`, not `x`, so passing the Series
# positionally no longer produces the intended per-value counts.
sns.countplot(x = 'Income', data = df, palette = 'hsv')
plt.title('Distribution of Annual Income', fontsize = 10)
plt.show()
In [15]:
# Number of customers at each spending-score value
plt.rcParams['figure.figsize'] = (20, 8)
# Name the column via x=/data= keywords: recent seaborn releases treat the
# first positional argument as `data`, not `x`, so passing the Series
# positionally no longer produces the intended per-value counts.
sns.countplot(x = 'Spending_score', data = df, palette = 'hsv')
plt.title('Distribution of Spending Score', fontsize = 10)
plt.show()
In [16]:
# Pairwise relationships between the three numeric features
numeric_cols = ['Age', 'Income', 'Spending_score']
g = sns.pairplot(df.loc[:, numeric_cols])
plt.show()
In [17]:
# Correlation heatmap of the three numeric features
plt.rcParams['figure.figsize'] = (12, 6)

data = df.loc[:, ['Age', 'Income', 'Spending_score']]
correlations = data.corr()

# annot=True writes each correlation coefficient inside its cell
sns.heatmap(correlations, annot = True, cmap = 'Wistia')
plt.title('Heatmap for the Data', fontsize = 20)
plt.show()
In [18]:
# Gender vs Spending Score plot

plt.rcParams['figure.figsize'] = (18, 7)
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so boxenplot(series_a, series_b) raises a TypeError there.
sns.boxenplot(x = 'Gender', y = 'Spending_score', data = df, palette = 'Blues')
plt.title('Gender vs Spending Score', fontsize = 20)
plt.show()
In [19]:
# Gender vs Income plot

plt.rcParams['figure.figsize'] = (18, 7)
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so boxenplot(series_a, series_b) raises a TypeError there.
sns.boxenplot(x = 'Gender', y = 'Income', data = df, palette = 'Greens')
plt.title('Gender vs Income', fontsize = 20)
plt.show()
In [20]:
# Income and spending score as functions of age, drawn on the same axes
y = df['Income']
x = df['Age']
z = df['Spending_score']

# x/y are passed by keyword because recent seaborn releases accept only
# `data` positionally. Each line gets a label so the shared axes stay
# readable once the legend is drawn.
sns.lineplot(x = x, y = y, color = 'blue', label = 'Income')
sns.lineplot(x = x, y = z, color = 'pink', label = 'Spending Score')
# Both series share the y axis, so give it a label that covers both instead
# of the automatic single-column name
plt.ylabel('Income (k$) / Spending Score (1-100)')
plt.title('Age vs Income and Age vs Spending Score', fontsize = 20)
plt.legend()
plt.show()

Cluster Analysis

In [21]:
# Feature matrix for clustering: annual income and spending score.
# The columns are selected by name rather than by position ([3, 4]) so the
# selection stays correct if the column order of the frame ever changes.
x = df[['Income', 'Spending_score']].values

# let's check the shape of x
print(x.shape)
(200, 2)

Using a Dendrogram to find the Optimal No. of Clusters

In [22]:
import scipy.cluster.hierarchy as sch

# Build the hierarchical clustering with Ward linkage on the feature matrix,
# then draw its dendrogram; sch.dendrogram plots onto the current axes.
linkage_matrix = sch.linkage(x, method = 'ward')
dendrogram = sch.dendrogram(linkage_matrix)
# Spelling of the title and y-axis label corrected
# ('Dendrogam' -> 'Dendrogram', 'Ecuclidean' -> 'Euclidean')
plt.title('Dendrogram', fontsize = 20)
plt.xlabel('Customers')
plt.ylabel('Euclidean Distance')
plt.show()
In [ ]: