# Silence every Python warning for the rest of the session to keep notebook
# output clean. This runs before the third-party imports below, so warnings
# emitted while importing them are hidden as well.
import warnings
warnings.filterwarnings("ignore")
# Third-party libraries: data handling (numpy, pandas), static plotting
# (matplotlib, seaborn) and interactive tables/figures (plotly).
import numpy as np
import pandas as pd
from pandas import plotting
import matplotlib.pyplot as plt
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as ff
import seaborn as sns
# Use matplotlib's 'fivethirtyeight' style sheet for the figures drawn below.
plt.style.use('fivethirtyeight')
# Read the customer records from the CSV file and peek at the first rows.
df = pd.read_csv('Mall_Customers.csv')
df.head()
# Shorten the two verbose column headers so they are easier to reference.
column_aliases = {
    'Annual Income (k$)': 'Income',
    'Spending Score (1-100)': 'Spending_score',
}
df.rename(columns=column_aliases, inplace=True)
# Render the first rows (with the new column names) as a plotly table.
dat = ff.create_table(df.head())
py.iplot(dat)
# Summary statistics of the numeric columns.
df.describe()
# Render the same summary as a plotly table. index=True is required here:
# create_table drops the DataFrame index by default, which would remove the
# statistic names (count, mean, std, ...) and leave the rows unlabeled.
desc = ff.create_table(df.describe(), index=True)
py.iplot(desc)
# Report the size of the dataset, the number of missing values per column
# and the dtype of each column.
n_rows, n_cols = df.shape
print()
print("Database has {} observations (instances) and {} columns (attributes).".format(n_rows, n_cols))

missing_per_column = df.isnull().sum()
print()
print("Missing values in each column:\n{}".format(missing_per_column))

print()
print("Columns data types:\n{}".format(df.dtypes))
## OR
# A single boolean answer: True if any cell anywhere in the frame is null.
res = df.isnull().any().any()
print()
print("Is there any NULL data in the dataframe: ", res)
# Andrews curves of every column except the customer identifier, with one
# colour per Gender value.
plt.rcParams['figure.figsize'] = (12, 8)
features = df.drop(columns="CustomerID")
plotting.andrews_curves(features, "Gender")
plt.title('Andrew Curves for Gender', fontsize=10)
plt.grid()
plt.show()
Andrews curves preserve the means, distances (up to a constant) and variances of the original data.
# Side-by-side distributions of income, age and spending score in one
# 1x3 figure.
plt.rcParams['figure.figsize'] = (18, 4)
# Apply the seaborn style once, before the figure is created, so all three
# panels share the same look.
sns.set(style='whitegrid')
# (column, colour, title, x-axis label); None keeps seaborn's default colour.
panels = [
    ('Income', None, 'Distribution of Annual Income', 'Range of Annual Income'),
    ('Age', 'red', 'Distribution of Age', 'Range of Age'),
    ('Spending_score', 'green', 'Distribution of Spending Score', 'Range of Spending Score'),
]
for position, (column, color, title, xlabel) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    sns.distplot(df[column], color=color)
    plt.title(title, fontsize=10)
    plt.xlabel(xlabel)
    # distplot draws a KDE by default, which normalises the histogram, so
    # the y-axis is a density rather than a count.
    plt.ylabel('Density')
# Show only after all three panels are drawn; showing earlier would flush
# the figure and push the last panel into a separate, mostly empty figure.
plt.show()
# Share of each gender among the customers.
size = df['Gender'].value_counts()
# Take the labels from the counts themselves: value_counts() orders by
# frequency, so a hard-coded label list could end up attached to the wrong
# wedge if the ordering of the data differed.
labels = size.index.tolist()
colors = ['lightgreen', 'orange']
# Pull the second wedge slightly out of the pie.
# NOTE(review): assumes exactly two gender categories; plt.pie requires
# len(explode) == len(size).
explode = [0, 0.1]
plt.rcParams['figure.figsize'] = (6, 6)
plt.pie(size, colors=colors, explode=explode, labels=labels, shadow=True, autopct='%.2f%%')
plt.title('Gender', fontsize=20)
plt.axis('off')
plt.legend()
plt.show()
The pie chart above shows the distribution of gender among the mall's customers.
Interestingly, females are in the lead with a share of 56%, whereas males have a share of 44%. That is a large gap, especially given that the male population is comparatively higher than the female population.
# Per-value frequency bar charts for age, income and spending score.
# The column is passed as the keyword `x`: in seaborn >= 0.12 the first
# positional parameter of countplot is `data`, not `x`, so the keyword form
# is the one that works across seaborn versions.
plt.rcParams['figure.figsize'] = (15, 6)
sns.countplot(x=df['Age'], palette='hsv')
plt.title('Distribution of Age', fontsize=15)
plt.show()

plt.rcParams['figure.figsize'] = (20, 8)
sns.countplot(x=df['Income'], palette='hsv')
plt.title('Distribution of Annual Income', fontsize=10)
plt.show()

plt.rcParams['figure.figsize'] = (20, 8)
sns.countplot(x=df['Spending_score'], palette='hsv')
plt.title('Distribution of Spending Score', fontsize=10)
plt.show()
# Scatter-plot matrix of the three numeric customer attributes.
g = sns.pairplot(df.loc[:, ['Age', 'Income', 'Spending_score']])
# Optional figure-level title, left disabled:
#g.fig.suptitle("Pairplot for the Data")
plt.show()
# Annotated heatmap of the pairwise correlations between the three numeric
# customer attributes.
data = df[['Age', 'Income', 'Spending_score']]
corr_matrix = data.corr()
plt.rcParams['figure.figsize'] = (12, 6)
sns.heatmap(corr_matrix, annot=True, cmap='Wistia')
plt.title('Heatmap for the Data', fontsize=20)
plt.show()
# Gender vs spending score.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError; the keyword form works on older
# versions too.
plt.rcParams['figure.figsize'] = (18, 7)
sns.boxenplot(x=df['Gender'], y=df['Spending_score'], palette='Blues')
plt.title('Gender vs Spending Score', fontsize=20)
plt.show()
# Gender vs income.
plt.rcParams['figure.figsize'] = (18, 7)
sns.boxenplot(x=df['Gender'], y=df['Income'], palette='Greens')
plt.title('Gender vs Income', fontsize=20)
plt.show()
# Income and spending score as functions of age, drawn on the same axes.
y = df['Income']
x = df['Age']
z = df['Spending_score']
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y.
# Each line gets a label so the legend identifies which curve is which.
sns.lineplot(x=x, y=y, color='blue', label='Income')
sns.lineplot(x=x, y=z, color='pink', label='Spending score')
plt.title('Age vs Income and Age vs Spending Score', fontsize=20)
plt.legend()
plt.show()
# Feature matrix for clustering: the columns at positions 3 and 4.
# NOTE(review): presumably these are Income and Spending_score; confirm
# against the CSV column order, or select the columns by name instead.
x = df.iloc[:, [3, 4]].values
# let's check the shape of x
print(x.shape)
import scipy.cluster.hierarchy as sch
# Agglomerative clustering with Ward linkage, drawn as a dendrogram.
dendrogram = sch.dendrogram(sch.linkage(x, method='ward'))
# Axis and title text corrected: the original labels were misspelled
# ('Dendrogam', 'Ecuclidean Distance').
plt.title('Dendrogram', fontsize=20)
plt.xlabel('Customers')
plt.ylabel('Euclidean Distance')
plt.show()