# Suppress warnings in Jupyter Notebooks
import warnings
warnings.filterwarnings("ignore")
# Load the library
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import plotly.graph_objects as go
# Load the dataset
df = pd.read_csv('OnlineRetail_sample.csv')
df.head()
df.shape
df.describe()
# How many missing values are in each column and of what data types they are?
print(); print("Database has {} observations (instances) and {} columns (attributes).".format(df.shape[0],df.shape[1]))
print(); print("Missing values in each column:\n{}".format(df.isnull().sum()))
print(); print("Columns data types:\n{}".format(df.dtypes))
# Drop the intances that have null values
df.dropna(subset=['CustomerID'], how='all', inplace= True)
df.shape
# considering the negative quantities as 0 for rfm analysis
df['Quantity']= list(map((lambda x: 0 if(x<0) else x),df['Quantity']))
# creating a new column called price which will quantity x unitprice
df['Price']= df['Quantity'] * df['UnitPrice']
df.head()
df.describe()
# Let's derive a separate column for Date from Invoice Date
df['date'] = pd.to_datetime(df['InvoiceDate']).dt.date
df
# Inorder to fix the date of analysis we will see the date range
print(df['date'].min(), '\t', df['date'].max())
# Let's consider the analysis date as 2012-01-01 and lets calculate the recency
import datetime as dt
df['no_of_days'] = (dt.date(2012,1,1))-df['date']
df['no_of_days'] = df['no_of_days'].astype('timedelta64[D]')
df.head()
rfm = df.groupby('CustomerID').agg({'no_of_days': lambda x:x.min(), \
'CustomerID': lambda x: len(x), \
'Price': lambda x: x.sum()})
rfm.rename(columns={'no_of_days': 'recency', 'CustomerID': 'frequency', 'Price': 'monetary'}, inplace=True)
rfm.head()
rfm.describe()
rfm["recency_score"] = pd.cut(rfm["recency"], bins=[-1,np.percentile(rfm["recency"], 25),np.percentile(rfm["recency"], 50),
np.percentile(rfm["recency"], 75), rfm["recency"].max()],
labels=[4, 3, 2, 1]).astype("int")
rfm["frequency_score"] = pd.cut(rfm["frequency"], bins=[-1,np.percentile(rfm["frequency"], 25),
np.percentile(rfm["frequency"], 50),
np.percentile(rfm["frequency"], 75), rfm["frequency"].max()],
labels=[1, 2, 3, 4]).astype("int")
rfm["monetary_score"] = pd.cut(rfm["monetary"], bins=[-1,np.percentile(rfm["monetary"], 25),
np.percentile(rfm["monetary"], 50),
np.percentile(rfm["monetary"], 75), rfm["monetary"].max()],
labels=[1, 2, 3, 4])
rfm.head()
rfm['RFMClass']=0
rfm['Customer_Seg']=''
j=0
while j <= (len(rfm)-1):
rfm['RFMClass'][j] = ((rfm['recency_score'].iloc[j])*100) + \
((rfm['frequency_score'].iloc[j])*10) + \
(rfm['monetary_score'].iloc[j])
j+=1
from tabulate import tabulate
# ---------------------------------------------------------
# Create a Reference table for Rule based segmentation
# ---------------------------------------------------------
table= [['R','M','F1','F2','F3','F4'],
['4','4','Unsteady','Active','Top customer','Top customer'],
[' ','3','Unsteady','Active','Top customer','Top customer'],
[' ','2','Unsteady','Active','Active','Active'],
[' ','1','Unsteady','Active','Active','Active'],
['3','4','Unsteady','Emerging','Emerging','Emerging'],
[' ','3','Unsteady','Emerging','Emerging','Emerging'],
[' ','2','Unsteady','Unsteady','Unsteady','Unsteady'],
[' ','1','Unsteady','Unsteady','Unsteady','Unsteady'],
['2','4','At risk','At risk','At risk','At risk'],
[' ',3,'At risk','At risk','At risk','At risk'],
[' ','2','Potentially lost','Potentially lost','At risk','At risk'],
[' ','2','Potentially lost','Potentially lost','At risk','At risk'],
[' ','1','Potentially lost','Potentially lost','At risk','At risk'],
['1','4','Inactive','Inactive','At risk','At risk'],
[' ','3','Inactive','Inactive','At risk','At risk'],
[' ','2','Lost','Lost','At risk','At risk'],
[' ','1','Lost','Lost','At risk','At risk']]
print('Customer Segmentation: ')
print(tabulate(table,headers='firstrow', tablefmt='fancy_grid'))
# Assign a segment based on RFM
k=0
while k <= (len(rfm)-1):
if rfm['recency_score'].iloc[k]== 4 and rfm['frequency_score'].iloc[k]>=3 and rfm['monetary_score'].iloc[k]>=3:
rfm['Customer_Seg'].iloc[k]= 'Top Customers'
elif rfm['recency_score'].iloc[k]== 4 and rfm['frequency_score'].iloc[k]>=1 and rfm['monetary_score'].iloc[k]>=1:
rfm['Customer_Seg'].iloc[k]= 'Active Customers'
elif rfm['recency_score'].iloc[k]== 3 and rfm['frequency_score'].iloc[k]>=2 and rfm['monetary_score'].iloc[k]>=3:
rfm['Customer_Seg'].iloc[k]= 'Emerging Customers'
elif rfm['recency_score'].iloc[k]>= 3 and rfm['frequency_score'].iloc[k]==1 and rfm['monetary_score'].iloc[k]>=1:
rfm['Customer_Seg'].iloc[k]= 'Unsteady Customers'
elif rfm['recency_score'].iloc[k]== 3 and rfm['frequency_score'].iloc[k]>=2 and rfm['monetary_score'].iloc[k]<=2:
rfm['Customer_Seg'].iloc[k]= 'Unsteady Customers'
elif rfm['recency_score'].iloc[k]== 2 and rfm['frequency_score'].iloc[k]>=1 and rfm['monetary_score'].iloc[k]>=3:
rfm['Customer_Seg'].iloc[k]= 'At Risk'
elif rfm['recency_score'].iloc[k]== 2 and rfm['frequency_score'].iloc[k]>=3 and rfm['monetary_score'].iloc[k]<=2:
rfm['Customer_Seg'].iloc[k]= 'At Risk'
elif rfm['recency_score'].iloc[k]== 1 and rfm['frequency_score'].iloc[k]>=3 and rfm['monetary_score'].iloc[k]>=1:
rfm['Customer_Seg'].iloc[k]= 'At Risk'
elif rfm['recency_score'].iloc[k]== 2 and rfm['frequency_score'].iloc[k]<=2 and rfm['monetary_score'].iloc[k]<=2:
rfm['Customer_Seg'].iloc[k]= 'Potentially lost'
elif rfm['recency_score'].iloc[k]== 1 and rfm['frequency_score'].iloc[k]<=2 and rfm['monetary_score'].iloc[k]>=3:
rfm['Customer_Seg'].iloc[k]= 'Inactive Customers'
elif rfm['recency_score'].iloc[k]== 1 and rfm['frequency_score'].iloc[k]<=2 and rfm['monetary_score'].iloc[k]<=2:
rfm['Customer_Seg'].iloc[k]= 'Lost'
else:
rfm['Customer_Seg'].iloc[k]= 'Uncategorized'
k+=1
rfm.head()
data = rfm.groupby("Customer_Seg")["RFMClass"].count()
data
# Visualisation
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.subplots(figsize=(10,10))
data.plot.pie(autopct="%.1f%%")
Observation: Significant number of customers have been lost and are at Risk
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.subplots(figsize=(18,5))
sns.barplot(y='frequency', x= 'Customer_Seg', data= rfm)
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.subplots(figsize=(18,5))
sns.barplot(y='frequency',x= 'Customer_Seg', data= rfm)
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.subplots(figsize=(18,5))
sns.barplot(y='monetary',x= 'Customer_Seg', data= rfm)