Applied Data Science Notebook in Python for Beginners to Professionals¶

An end-to-end tutorial on Clustering - Applied Machine Learning & Data Science¶

Clustering, Regression and Data Visualisation using Plotly in Python¶

# Silence every warning category so the notebook output stays uncluttered.

import warnings
warnings.simplefilter("ignore")

In this end-to-end tutorial on clustering & regression, the medical cost patient dataset is used. The first step is to import the necessary libraries.

The following libraries are used: pandas - to manipulate data frames; numpy - providing linear algebra; seaborn - to create nice visualizations; matplotlib - basic tools for visualizations; scikit-learn - machine learning library.

import numpy as np
import pandas as pd

# Plotly Packages
import plotly
import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns
from string import ascii_letters

# Statistical Libraries
from scipy.stats import norm
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from scipy import stats

# Regression Modeling
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Read the insurance data from the working directory
df = pd.read_csv("insurance.csv")

# Keep an untouched copy: later cells add derived columns to `df`.
original_df = df.copy(deep=True)

df.head()

# Render the first rows as a Plotly table
dat = ff.create_table(df.head(n=5))
iplot(dat)

Distribution of Medical Charges ($Cost)¶

Types of Distributions: We have a right-skewed distribution in which most patients are charged between 2,000 and 12,000 dollars.
Using Logarithms: Taking the logarithm of the charges gives a distribution that is closer to normal, which helps in a number of ways, such as outlier detection, applying statistical concepts based on the central limit theorem, and building our predictive model later on.

# Compare the raw distribution of charges with its natural-log transform
charge_dist = df["charges"].values
logcharge = np.log(df["charges"])

# (values, legend name, bar colour) for each histogram
hist_specs = [
    (charge_dist, "Charges Distribution", '#FA5858'),
    (logcharge, "Charges Distribution using Log", '#58FA82'),
]
trace0, trace1 = [
    go.Histogram(
        x=values,
        histnorm='probability',
        name=label,
        marker=dict(color=shade),
    )
    for values, label, shade in hist_specs
]

# Two stacked panels: original data on top, log-transformed data below
fig = plotly.subplots.make_subplots(
    rows=2,
    cols=1,
    subplot_titles=('Distribution of Charge (Original Data)','Distribution of Charge (Log transform)'),
    print_grid=False,
)
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=2, col=1)

fig.update_layout(showlegend=True, title='Distribution of Charges ($Cost)', bargap=0.05)
iplot(fig, filename='custom-sized-subplot-with-subplot-titles')

Age Analysis¶

Turning Age into Categorical Variables: Young Adult: from 18 to 35; Senior Adult: from 36 to 55; Elder: 56 or older. Share of each Category: Young Adults (42.9%), Senior Adults (41%) and Elder (16.1%).

# Bucket each patient's age into a life-stage category.
# Bins are right-inclusive: (17, 35] -> Young Adult, (35, 55] -> Senior Adult,
# (55, inf) -> Elder. Ages of 17 or below (and missing ages) stay NaN.
# The result is cast to plain object dtype so later string comparisons and
# model formulas see ordinary strings rather than a Categorical.
df['age_cat'] = pd.cut(
    df['age'],
    bins=[17, 35, 55, np.inf],
    labels=['Young Adult', 'Senior Adult', 'Elder'],
).astype(object)

# Take labels and counts from the SAME value_counts() result so that each pie
# slice is paired with its own label. (unique() returns categories in order of
# appearance, value_counts() in order of frequency, so mixing them can mislabel
# the slices.)
age_counts = df["age_cat"].value_counts()
labels = age_counts.index.tolist()
amount = age_counts.tolist()

# One colour per slice (the third entry previously had a stray leading space)
colors = ["#ff9999", "#b3d9ff", "#e6ffb3"]

trace = go.Pie(labels=labels, values=amount,
               hoverinfo='label+percent', textinfo='value',
               textfont=dict(size=20),
               marker=dict(colors=colors,
                           line=dict(color='#000000', width=2)))

data = [trace]
layout = go.Layout(title="Amount by Age Category")

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic_pie_chart')

Is there a Relationship between BMI and Age?

BMI frequency: Most of the BMI frequency is concentrated between 27 - 33.
Correlations: Age and charges have a correlation of 0.30, while BMI and charges have a correlation of 0.20.
Relationship between BMI and Age: The correlation for these two variables is 0.11, which is weak. Therefore, we can disregard the idea that age has a large influence on BMI.

# Distribution plot (histogram + density curve + rug) of Body Mass Index
group_labels = ['Body Mass Index Distribution']
colors = ['#FA5858']

# create_distplot expects one list of values per group
bmi = [df["bmi"].tolist()]

fig = ff.create_distplot(bmi, group_labels, colors=colors)
fig.update_layout(title='Normal Distribution <br> Central Limit Theorem Condition')  # Add title

iplot(fig, filename='Basic Distplot')

# Correlation heatmap
# Correlate only the numeric columns: the frame also holds string columns
# (e.g. smoker, region, age_cat), and recent pandas versions raise on them
# instead of silently dropping them.
corr = df.select_dtypes(include="number").corr()
print(corr)

# The correlation matrix is square and symmetric, so rows and columns share
# the same labels.
axis_labels = corr.index.values.tolist()

hm = go.Heatmap(
    z=corr.values,
    x=axis_labels,
    y=axis_labels
)

data = [hm]
layout = go.Layout(title="Correlation Heatmap")

fig = dict(data=data, layout=layout)
iplot(fig, filename='labelled-heatmap')

               age       bmi  children   charges
age       1.000000  0.109272  0.042469  0.299008
bmi       0.109272  1.000000  0.012759  0.198341
children  0.042469  0.012759  1.000000  0.067998
charges   0.299008  0.198341  0.067998  1.000000

# Body Mass Index by Age Category

# BMI values of each age group
young_adults = df["bmi"].loc[df["age_cat"] == "Young Adult"].values
senior_adult = df["bmi"].loc[df["age_cat"] == "Senior Adult"].values
elders = df["bmi"].loc[df["age_cat"] == "Elder"].values

# One box per age group; boxmean=True also draws the mean inside the box
trace0 = go.Box(
    y=young_adults,
    name='Young Adults',
    boxmean=True,
    marker=dict(color='rgb(214, 12, 140)'),
)

trace1 = go.Box(
    y=senior_adult,
    name='Senior Adults',
    boxmean=True,
    marker=dict(color='rgb(0, 128, 128)'),
)

trace2 = go.Box(
    y=elders,
    name='Elders',
    boxmean=True,
    marker=dict(color='rgb(247, 186, 166)'),
)

data = [trace0, trace1, trace2]

# Axis titles use the nested title=dict(text=..., font=...) form; the older
# `titlefont` shortcut is deprecated and rejected by recent plotly releases.
layout = go.Layout(
    title="Body Mass Index <br> by Age Category",
    xaxis=dict(title=dict(text="Age Category", font=dict(size=16))),
    yaxis=dict(title=dict(text="Body Mass Index", font=dict(size=16))),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

df.head()

# Body Mass Index by Gender Category

# BMI values of each gender
female = df["bmi"].loc[df["gender"] == "female"].values
male   = df["bmi"].loc[df["gender"] == "male"].values

# One box per gender; boxmean=True also draws the mean inside the box
trace0 = go.Box(
    y=female,
    name='Female',
    boxmean=True,
    marker=dict(color='rgb(214, 12, 140)'),
)

trace1 = go.Box(
    y=male,
    name='Male',
    boxmean=True,
    marker=dict(color='rgb(0, 128, 128)'),
)

data = [trace0, trace1]

# Axis titles use the nested title=dict(text=..., font=...) form; the older
# `titlefont` shortcut is deprecated and rejected by recent plotly releases.
layout = go.Layout(
    title="Body Mass Index <br> by Gender Category",
    xaxis=dict(title=dict(text="Gender Category", font=dict(size=16))),
    yaxis=dict(title=dict(text="Body Mass Index", font=dict(size=16))),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Comparing Independent Categorical Variables to do ANOVA tests

(a) P-value: The F-test p-value for the age categories (0.0027) is lower than 0.05, so we reject the null hypothesis: there is a statistically significant difference in Body Mass Index between the three age categories. Note, however, that the R-squared is only 0.009, so age category explains very little of the variation in BMI.
(b) P-value: The F-test p-value for gender (0.09) is higher than 0.05, so we fail to reject the null hypothesis, meaning that there is no significant difference between the two gender categories when it comes to Body Mass Index.

# Age Categories: regress BMI on the age category and print the fit summary
# (the F-statistic in the summary is what the ANOVA discussion refers to).
import statsmodels.api as sm
from statsmodels.formula.api import ols

age_cat_model = ols("bmi ~ age_cat", data=df)
moore_lm = age_cat_model.fit()
print(moore_lm.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    bmi   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     5.949
Date:                Fri, 06 Aug 2021   Prob (F-statistic):            0.00268
Time:                        15:55:57   Log-Likelihood:                -4311.2
No. Observations:                1338   AIC:                             8628.
Df Residuals:                    1335   BIC:                             8644.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                  31.7393      0.413     76.776      0.000      30.928      32.550
age_cat[T.Senior Adult]    -0.9202      0.488     -1.885      0.060      -1.878       0.037
age_cat[T.Young Adult]     -1.6295      0.485     -3.360      0.001      -2.581      -0.678
==============================================================================
Omnibus:                       19.635   Durbin-Watson:                   2.098
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               20.284
Skew:                           0.301   Prob(JB):                     3.94e-05
Kurtosis:                       2.981   Cond. No.                         5.27
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# Gender Categories: regress BMI on gender and print the fit summary
# (the F-statistic in the summary is what the ANOVA discussion refers to).
import statsmodels.api as sm
from statsmodels.formula.api import ols

gender_model = ols("bmi ~ gender", data=df)
moore_lm = gender_model.fit()
print(moore_lm.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    bmi   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     2.879
Date:                Fri, 06 Aug 2021   Prob (F-statistic):             0.0900
Time:                        15:55:57   Log-Likelihood:                -4315.7
No. Observations:                1338   AIC:                             8635.
Df Residuals:                    1336   BIC:                             8646.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         30.3777      0.237    128.259      0.000      29.913      30.842
gender[T.male]     0.5654      0.333      1.697      0.090      -0.088       1.219
==============================================================================
Omnibus:                       17.480   Durbin-Watson:                   2.087
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               18.017
Skew:                           0.282   Prob(JB):                     0.000122
Kurtosis:                       2.937   Cond. No.                         2.63
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# Body Mass Index of Smokers Status by Age Category
import plotly.graph_objs as go

# Row masks reused for every (age category, smoker status) combination
smokes = df["smoker"] == "yes"
abstains = df["smoker"] == "no"
is_young = df["age_cat"] == "Young Adult"
is_senior = df["age_cat"] == "Senior Adult"
is_elder = df["age_cat"] == "Elder"

# Smokers
ya_smoker = df.loc[is_young & smokes, "bmi"].values
sa_smoker = df.loc[is_senior & smokes, "bmi"].values
e_smoker  = df.loc[is_elder & smokes, "bmi"].values

# Non-Smokers
ya_nonsmoker = df.loc[is_young & abstains, "bmi"].values
sa_nonsmoker = df.loc[is_senior & abstains, "bmi"].values
e_nonsmoker = df.loc[is_elder & abstains, "bmi"].values

# Box labels, in the same order as y_data below
x_data = ['Young A. Smoker',  'Young A. Non-Smoker',
          'Senior A. Smoker', 'Senior A. Non-Smoker',
          'Elder Smoker',     'Elder Non-Smoker',]

y_data = [ya_smoker, ya_nonsmoker, sa_smoker, sa_nonsmoker, e_smoker, e_nonsmoker]
y0, y1, y2, y3, y4, y5 = y_data

# Alternate red (smoker) and green (non-smoker) fills
colors = ['rgba(251, 43, 43, 0.5)', 'rgba(125, 251, 137, 0.5)'] * 3

traces = []

for xd, yd, cls in zip(x_data, y_data, colors):
    traces.append(go.Box(
        y=yd,
        name=xd,
        boxpoints='all',      # draw every observation next to its box
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(size=2),
        line=dict(width=1),
    ))

# Axis titles use the nested title=dict(text=..., font=...) form; the older
# `titlefont` shortcut is deprecated and rejected by recent plotly releases.
layout = go.Layout(
    title='Body Mass Index of Smokers Status by Age Category',
    xaxis=dict(
        title=dict(text="Status", font=dict(size=16)),
    ),
    yaxis=dict(
        title=dict(text="Body Mass Index", font=dict(size=16)),
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 243, 192)',
    showlegend=False
)

fig = go.Figure(data=traces, layout=layout)
iplot(fig)

# Body Mass Index of Gender Status by Age Category
import plotly.graph_objs as go

# Row masks reused for every (age category, gender) combination
is_male = df["gender"] == "male"
is_female = df["gender"] == "female"
is_young = df["age_cat"] == "Young Adult"
is_senior = df["age_cat"] == "Senior Adult"
is_elder = df["age_cat"] == "Elder"

# Male
ya_male = df.loc[is_young & is_male, "bmi"].values
sa_male = df.loc[is_senior & is_male, "bmi"].values
e_male  = df.loc[is_elder & is_male, "bmi"].values

# Female
ya_female = df.loc[is_young & is_female, "bmi"].values
sa_female = df.loc[is_senior & is_female, "bmi"].values
e_female  = df.loc[is_elder & is_female, "bmi"].values

# Box labels, in the same order as y_data below
x_data = ['Young A. Male',  'Young A. Female',
          'Senior A. Male', 'Senior A. Female',
          'Elder Male',     'Elder Female',]

y_data = [ya_male, ya_female, sa_male, sa_female, e_male, e_female]
y0, y1, y2, y3, y4, y5 = y_data

# Alternate red (male) and green (female) fills
colors = ['rgba(251, 43, 43, 0.5)', 'rgba(125, 251, 137, 0.5)'] * 3

traces = []

for xd, yd, cls in zip(x_data, y_data, colors):
    traces.append(go.Box(
        y=yd,
        name=xd,
        boxpoints='all',      # draw every observation next to its box
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(size=2),
        line=dict(width=1),
    ))

# Axis titles use the nested title=dict(text=..., font=...) form; the older
# `titlefont` shortcut is deprecated and rejected by recent plotly releases.
layout = go.Layout(
    title='Body Mass Index of Gender Status by Age Category',
    xaxis=dict(
        title=dict(text="Status", font=dict(size=16)),
    ),
    yaxis=dict(
        title=dict(text="Body Mass Index", font=dict(size=16)),
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 243, 192)',
    showlegend=False
)

fig = go.Figure(data=traces, layout=layout)
iplot(fig)

Who got charged more on Average by Age¶

Who got charged more on Average by Age?

Patient Charge Mean: For young adults it is 7,944, for Senior Adults it is 14,785 and for the elder it is 18,795.
Patient Charge Median: For young adults it is 4,252, for Senior Adults it is 9,565 and for the elder it is 13,429.
Mean and the Median: Sometimes we must be careful when using the mean since it is prone to be affected by outliers.

# Mean could be affected easily by outliers or extreme cases, so the median
# is plotted next to it for comparison.

# Charges of each age group
ya_charges = df.loc[df["age_cat"] == "Young Adult", "charges"]
sa_charges = df.loc[df["age_cat"] == "Senior Adult", "charges"]
e_charges = df.loc[df["age_cat"] == "Elder", "charges"]

# Means
avg_ya_charge = ya_charges.mean()
avg_sa_charge = sa_charges.mean()
avg_e_charge = e_charges.mean()

# Median
med_ya_charge = ya_charges.median()
med_sa_charge = sa_charges.median()
med_e_charge = e_charges.median()

age_groups = ['Young Adults', 'Senior Adults', 'Elder']

average_plot = go.Bar(
    x=list(age_groups),
    y=[avg_ya_charge, avg_sa_charge, avg_e_charge],
    name='Mean',
    marker=dict(color="#F5B041"),
)
med_plot = go.Bar(
    x=list(age_groups),
    y=[med_ya_charge, med_sa_charge, med_e_charge],
    name='Median',
    marker=dict(color="#48C9B0"),
)

# Side-by-side panels that share one y-axis so the bars are directly comparable
fig = plotly.subplots.make_subplots(
    rows=1,
    cols=2,
    specs=[[{}, {}]],
    subplot_titles=('Average Charge by Age','Median Charge by Age'),
    shared_yaxes=True,
    print_grid=False,
)

fig.add_trace(average_plot, row=1, col=1)
fig.add_trace(med_plot, row=1, col=2)

fig.update_layout(
    showlegend=True,
    title='Age Charges',
    title_x=0.5,
    xaxis=dict(title=""),
    yaxis=dict(title="Patient Charges"),
    bargap=0.15,
)
iplot(fig, filename='custom-sized-subplot-with-subplot-titles')

# Mean could be affected easily by outliers or extreme cases, so the median
# is plotted next to it for comparison.


def _charges_of(age_label, gender_label):
    """Return the charges of patients in the given age category and gender."""
    chosen = (df["age_cat"] == age_label) & (df["gender"] == gender_label)
    return df.loc[chosen, "charges"]


# Means for male and female
avg_ya_charge_male = _charges_of("Young Adult", "male").mean()
avg_sa_charge_male = _charges_of("Senior Adult", "male").mean()
avg_e_charge_male  = _charges_of("Elder", "male").mean()

avg_ya_charge_female = _charges_of("Young Adult", "female").mean()
avg_sa_charge_female = _charges_of("Senior Adult", "female").mean()
avg_e_charge_female  = _charges_of("Elder", "female").mean()

# Median
med_ya_charge_male = _charges_of("Young Adult", "male").median()
med_sa_charge_male = _charges_of("Senior Adult", "male").median()
med_e_charge_male  = _charges_of("Elder", "male").median()

med_ya_charge_female = _charges_of("Young Adult", "female").median()
med_sa_charge_female = _charges_of("Senior Adult", "female").median()
med_e_charge_female  = _charges_of("Elder", "female").median()

# Bar labels shared by both panels (males first, then females)
bar_labels = ['Young Adults Male', 'Senior Adults Male', 'Elder Male',
              'Young Adults Female', 'Senior Adults Female', 'Elder Female']

average_plot = go.Bar(
    x=list(bar_labels),
    y=[avg_ya_charge_male, avg_sa_charge_male, avg_e_charge_male,
       avg_ya_charge_female, avg_sa_charge_female, avg_e_charge_female],
    name='Mean',
    marker=dict(color="#F5B041"),
)

med_plot = go.Bar(
    x=list(bar_labels),
    y=[med_ya_charge_male, med_sa_charge_male, med_e_charge_male,
       med_ya_charge_female, med_sa_charge_female, med_e_charge_female],
    name='Median',
    marker=dict(color="#48C9B0"),
)

# Side-by-side panels that share one y-axis so the bars are directly comparable
fig = plotly.subplots.make_subplots(
    rows=1,
    cols=2,
    specs=[[{}, {}]],
    subplot_titles=('Average Charge by Age','Median Charge by Age'),
    shared_yaxes=True,
    print_grid=False,
)

fig.add_trace(average_plot, row=1, col=1)
fig.add_trace(med_plot, row=1, col=2)

fig.update_layout(
    showlegend=True,
    title='Age Charges',
    title_x=0.5,
    xaxis=dict(title=""),
    yaxis=dict(title="Patient Charges"),
    bargap=0.15,
)
iplot(fig, filename='custom-sized-subplot-with-subplot-titles')

Weight Status: https://www.cancer.org/cancer/cancer-causes/diet-physical-activity/body-weight-and-cancer-risk/adult-bmi.html

Turning BMI into Categorical Variables:

Under Weight: Body Mass Index (BMI) < 18.5
Normal Weight: Body Mass Index (BMI) ≥ 18.5 and Body Mass Index (BMI) < 25
Overweight: Body Mass Index (BMI) ≥ 25 and Body Mass Index (BMI) < 30
Obese: Body Mass Index (BMI) ≥ 30

# Classify every patient by BMI using contiguous, left-inclusive bins:
#   [-inf, 18.5) Underweight, [18.5, 25) Normal Weight,
#   [25, 30) Overweight,      [30, inf) Obese.
# The previous hand-tuned upper bounds (24.986 / 29.926) left the ranges
# [24.986, 25) and [29.926, 30) unassigned, so a BMI falling there became NaN.
# The result is cast to plain object dtype so later string comparisons see
# ordinary strings rather than a Categorical.
df["weight_condition"] = pd.cut(
    df["bmi"],
    bins=[-np.inf, 18.5, 25, 30, np.inf],
    labels=["Underweight", "Normal Weight", "Overweight", "Obese"],
    right=False,
).astype(object)

df.head()

# Render the first rows (now including the new column) as a Plotly table
dat = ff.create_table(df.head())
iplot(dat)

# Three side-by-side strip plots of charges
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(18,8))
ax1, ax2, ax3 = axes

# I wonder if the cluster that is on the top is from obese people
sns.stripplot(x="age_cat", y="charges", data=df, ax=ax1, linewidth=1, palette="Reds")
ax1.set_title("Relationship between Charges and Age")

# Same x-axis, but points coloured by weight condition
sns.stripplot(x="age_cat", y="charges", hue="weight_condition", data=df, ax=ax2, linewidth=1, palette="Set2")
ax2.set_title("Relationship of Weight Condition, Age and Charges")

# Charges by smoker status; the legend is dropped on this panel
sns.stripplot(x="smoker", y="charges", hue="weight_condition", data=df, ax=ax3, linewidth=1, palette="Set2")
ax3.get_legend().remove()
ax3.set_title("Relationship between Smokers and Charges")

plt.show()

# Make sure we don't have any null values: select rows with at least one null
df.loc[df.isna().any(axis=1)]

Weight Status vs Charges

Overweight: Notice how there are two groups of people that are charged significantly more than the rest of the overweight patients.
Obese: The same goes for the obese group, where a significant group is charged more than the other group.

# Scatter of charges against age, coloured by weight condition
weight_palette = {
    'Underweight': 'rgb(208, 246, 130)',
    'Normal Weight': 'rgb(166, 246, 130)',
    'Overweight': 'rgb(251, 232, 238)',
    'Obese': 'rgb(253, 45, 28)',
}

fig = ff.create_facet_grid(
    df,
    x='age',
    y='charges',
    color_name='weight_condition',
    show_boxes=False,
    marker={'size': 10, 'opacity': 1.0},
    colormap=weight_palette,
)

fig.update_layout(
    title="Weight Status vs Charges",
    title_x=0.5,
    width=800,
    height=600,
    plot_bgcolor='rgb(251, 251, 251)',
    paper_bgcolor='rgb(255, 255, 255)',
)
iplot(fig, filename='facet - custom colormap')

# First find the average of the charges obese people paid.
obese_avg = df["charges"].loc[df["weight_condition"] == "Obese"].mean()

# Label every patient relative to that obese-group average.
# The column starts as object dtype (all missing) so string labels can be
# written into it; rows with a missing charge keep NaN.
df["charge_status"] = pd.Series(np.nan, index=df.index, dtype=object)
df.loc[df["charges"] > obese_avg, "charge_status"] = "Above Average"
# `<=` so that a charge exactly equal to the average is labelled as well
# (with strict `<` it would match neither condition and stay unlabelled).
df.loc[df["charges"] <= obese_avg, "charge_status"] = "Below Average"

# Render the first rows (now including the new column) as a Plotly table
dat = ff.create_table(df.head())
iplot(dat)

Obesity and the Impact of Smoking to the Wallet:

Notice in the charges box how smoking looks to have a certain impact on medical costs. Let's find out how much of a difference there is between the group of obese patients that smoke compared to the group of obese patients that don't smoke.

import seaborn as sns
sns.set(style="ticks")

# Red for one smoker status, blue for the other (assigned in hue order)
pal = ["#FA5858", "#58D3F7"]

# Pairwise scatter/density grid of the numeric columns, coloured by smoker status
sns.pairplot(df, hue="smoker", palette=pal)

# Title the whole figure. plt.title() would only title the most recently
# created subplot of the grid; y > 1 lifts the text above the top row of panels.
plt.suptitle("Smokers", y=1.02)

Text(0.5, 1.0, 'Smokers')

# What percentage of the obese patients smoke, and what do obese smokers and
# obese non-smokers pay on average?

# Row masks shared by the computations below
is_obese = df["weight_condition"] == "Obese"
smokes = df["smoker"] == "yes"
abstains = df["smoker"] == "no"

# 79% of Obese were non-smokers while the 21% left were smokers
total_obese = len(df.loc[is_obese])

obese_smoker_prop = len(df.loc[is_obese & smokes]) / total_obese
obese_smoker_prop = round(obese_smoker_prop, 2)

obese_nonsmoker_prop = len(df.loc[is_obese & abstains]) / total_obese
obese_nonsmoker_prop = round(obese_nonsmoker_prop, 2)


# Average of each numeric column (including charges) for obese smokers and
# obese non-smokers. numeric_only=True is required because the frame also
# holds string columns, which recent pandas versions refuse to average.
charge_obese_smoker = df.loc[is_obese & smokes].mean(numeric_only=True)
charge_obese_nonsmoker = df.loc[is_obese & abstains].mean(numeric_only=True)

Distribution of Charges (Obese Smoker vs Obese non-Smoker)

Violin Plots: We will be using violin plots to compare the distributions of patients of the obese group who are smokers and non-smokers.
Obese smokers distribution: Most obese smokers pay around 40k in medical costs!
Obese non-smokers distribution: Most obese non-smokers pay 8k in medical costs.
Smoking a factor to the wallet: Smoking is definitely a big factor for obese patients when it comes to medical cost. A difference of more than 30k!

# Split violins of charges per weight condition: smokers on the left
# ("negative") side, non-smokers on the right ("positive") side.

# Per-category horizontal offsets of the raw points and legend visibility.
# Only the first category shows a legend entry so each group appears once.
pointspossmoker = [-0.9,-1.1,-0.6,-0.3]
pointposnonsmoker = [0.45,0.55,1,0.4]
showLegend = [True,False,False,False]


def _violin_half(condition, smoker_flag, group, side, pointpos,
                 line_color, marker_color, show_legend):
    """Return the trace dict for one half-violin.

    condition    -- weight_condition value whose rows are plotted
    smoker_flag  -- value of the `smoker` column to select ('yes' or 'no')
    group        -- legend/scale group and trace name
    side         -- 'negative' or 'positive' half of the violin
    pointpos     -- horizontal offset of the raw points
    line_color / marker_color -- outline colours of the violin and its points
    show_legend  -- whether this trace gets a legend entry
    """
    rows = (df['smoker'] == smoker_flag) & (df['weight_condition'] == condition)
    return {
        "type": 'violin',
        "x": df['weight_condition'][rows],
        "y": df['charges'][rows],
        "legendgroup": group,
        "scalegroup": group,
        "name": group,
        "side": side,
        "box": {
            "visible": True
        },
        "points": 'all',
        "pointpos": pointpos,
        "jitter": 0,
        "scalemode": 'count',
        "meanline": {
            "visible": True
        },
        "line": {
            "color": line_color
        },
        "marker": {
            "line": {
                "width": 2,
                "color": marker_color
            }
        },
        # NOTE(review): kept exactly as in the original; the figure is plotted
        # with validate=False, so this value is never checked -- confirm intent.
        "span": [
            0
        ],
        "showlegend": show_legend
    }


# Compute the distinct categories once instead of on every loop iteration.
# NaN is dropped: it has no matching rows and would otherwise push the number
# of categories past the four entries of the offset lists above.
weight_conditions = df['weight_condition'].dropna().unique()

data = []
# zip() pairs each category with its offsets/legend flag and stops at the
# shortest sequence, so a surplus category can no longer raise IndexError.
for condition, smoker_pos, nonsmoker_pos, show in zip(
        weight_conditions, pointspossmoker, pointposnonsmoker, showLegend):
    # Local names deliberately differ from `male`/`female`, which hold the
    # BMI arrays built for the gender box plot.
    smoker_half = _violin_half(condition, 'yes', 'Smoker', 'negative',
                               smoker_pos, '#DF0101', '#F78181', show)
    nonsmoker_half = _violin_half(condition, 'no', 'Non-Smoker', 'positive',
                                  nonsmoker_pos, '#00FF40', '#81F781', show)
    data.append(smoker_half)
    data.append(nonsmoker_half)


fig = {
    "data": data,
    "layout" : {
        "title": "Charges Distribution of Obese Patients<br><i>Group by Smoking Status",
        "yaxis": {
            "zeroline": False,
            # Nested title form; the older `titlefont` key is deprecated.
            "title": {
                "text": "Patient Charges",
                "font": {
                    "size": 16
                }
            }
        },
        "violingap": 0,
        "violingroupgap": 0,
        "violinmode": "overlay"
    }
}

iplot(fig, filename='violin/advanced', validate = False)

# We have to look closer into the Obese group: there is an obvious difference
# in charges between smokers and non-smokers.

is_obese = df["weight_condition"] == "Obese"

# Charges of obese smokers and obese non-smokers
chargedist_sm = df.loc[is_obese & (df["smoker"] == "yes"), "charges"].values
chargedist_nsm = df.loc[is_obese & (df["smoker"] == "no"), "charges"].values

trace0 = go.Box(
    y=chargedist_sm,
    name='Obese Smokers',
    marker=dict(color='#DF0101'),
)
trace1 = go.Box(
    y=chargedist_nsm,
    name='Obese Non-Smokers',
    marker=dict(color='#00FF40'),
)

data = [trace0, trace1]

# Titles use the nested dict form (text / x / font); the older `titlefont`
# shortcut is deprecated and rejected by recent plotly releases.
layout = dict(
    title=dict(text="Deeper Look into Obese condition by Smoking status", x=0.5),
    xaxis=dict(title=dict(text="Status", font=dict(size=16))),
    yaxis=dict(title=dict(text="Patient Charges", font=dict(size=16))),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

Separation in Charges between Obese Smokers vs Non-Obese Smokers

In this chart we can visualize how we can separate obese smokers and obese non-smokers into different clusters. Therefore, we can say that smoking is a characteristic that definitely affects patients' charges.

# Create a Scatter Plot with all the Obese patients, split by smoking status
is_obese = df["weight_condition"] == "Obese"
obese_smoker = df.loc[is_obese & (df["smoker"] == "yes")]
obese_nonsmoker = df.loc[is_obese & (df["smoker"] == "no")]

# Smokers: red markers with a black outline
trace0 = go.Scatter(
    x=obese_smoker["age"].values,
    y=obese_smoker["charges"].values,
    name='Smokers',
    mode='markers',
    marker=dict(
        size=10,
        color='#DF0101',
        line=dict(
            width=2,
            color='rgb(0, 0, 0)'
        )
    )
)

# Non-smokers: green markers with the default outline colour
trace1 = go.Scatter(
    x=obese_nonsmoker["age"].values,
    y=obese_nonsmoker["charges"].values,
    name='Non-Smokers',
    mode='markers',
    marker=dict(
        size=10,
        color='#00FF40',
        line=dict(
            width=2,
        )
    )
)

data = [trace0, trace1]

# Titles use the nested dict form (text / x / font); the older `titlefont`
# shortcut is deprecated and rejected by recent plotly releases.
layout = dict(
    title=dict(text='Clear Separation between Obese Smokers and Non-Smokers in Charges', x=0.5),
    yaxis=dict(zeroline=False,
               title=dict(text="Patient Charges", font=dict(size=16))),
    xaxis=dict(zeroline=False,
               title=dict(text="Age of the Patient", font=dict(size=16))),
)

fig = dict(data=data, layout=layout)
iplot(fig, filename='styled-scatter')

Average Patient Charge by Region:

Median Patient Charges: The NorthEast is the region with the highest median charge, while the SouthWest is the one with the lowest.
Obese group: Within the obese group, the SouthWest is the region where patients pay the most.
Overweight: Within the overweight group, the NorthWest is the region where patients pay the most.
Normal Weight: Within the normal-weight group, the SouthEast is the region where patients pay the most.
Underweight: Within the underweight group, the NorthWest is the region where patients pay the most.

df.head()

# Average charge by Region
df["region"].unique()

# Median Charges per Region, keyed by region name (insertion order is kept)
region_medians = {
    region_name: np.median(df.loc[df["region"] == region_name, "charges"].values)
    for region_name in ("southwest", "southeast", "northwest", "northeast")
}
southwest = region_medians["southwest"]
southeast = region_medians["southeast"]
northwest = region_medians["northwest"]
northeast = region_medians["northeast"]

lst = list(region_medians.values())

# Radar chart with one spoke per region
radar = go.Scatterpolar(
    r=list(lst),
    theta=['SouthWest', 'SouthEast', 'NorthWest', 'NorthEast'],
    fill='toself',
)
data = [radar]

# The radial axis runs from zero up to the largest regional median
radial_axis = dict(visible=True, range=[0, max(lst)])
layout = go.Layout(
    title="Median Charged to Patients by Region",
    paper_bgcolor="rgb(255, 255, 224)",
    polar=dict(radialaxis=radial_axis),
    showlegend=False,
)


fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "radar/basic")

Average Charge by Region depending on the Weight Condition:

# Weight Condition by Region Radar plots

df["weight_condition"].unique()


def _mean_charge(region, condition):
    """Return the mean of `charges` for rows of `df` in one region and weight condition.

    Parameters:
        region: value compared against the `region` column (e.g. "southwest").
        condition: value compared against the `weight_condition` column
            (e.g. "Obese").

    Returns:
        The `np.mean` of the matching `charges` values.
    """
    selected = (df["region"] == region) & (df["weight_condition"] == condition)
    return np.mean(df["charges"].loc[selected].values)


# Average charges for overweight patients by region
sw_overweight = _mean_charge("southwest", "Overweight")
se_overweight = _mean_charge("southeast", "Overweight")
nw_overweight = _mean_charge("northwest", "Overweight")
ne_overweight = _mean_charge("northeast", "Overweight")

# Obese
sw_obese = _mean_charge("southwest", "Obese")
se_obese = _mean_charge("southeast", "Obese")
nw_obese = _mean_charge("northwest", "Obese")
ne_obese = _mean_charge("northeast", "Obese")

# Normal Weight
sw_nw = _mean_charge("southwest", "Normal Weight")
se_nw = _mean_charge("southeast", "Normal Weight")
nw_nw = _mean_charge("northwest", "Normal Weight")
ne_nw = _mean_charge("northeast", "Normal Weight")

# Underweight
sw_uw = _mean_charge("southwest", "Underweight")
se_uw = _mean_charge("southeast", "Underweight")
nw_uw = _mean_charge("northwest", "Underweight")
ne_uw = _mean_charge("northeast", "Underweight")

# Labels
# Listed explicitly, in the same order as the values in the lists below, so
# each radar axis is always paired with the average it names (the order that
# `unique()` returns depends on row order in the dataframe).
weight_labels = ["Overweight", "Obese", "Normal Weight", "Underweight"]

# List per weight condition
# Order: overweight, obese, normal weight, underweight. The second slot must
# hold each region's *obese* average, not a repeat of the overweight one.
sw_weights = [sw_overweight, sw_obese, sw_nw, sw_uw]
se_weights = [se_overweight, se_obese, se_nw, se_uw]
nw_weights = [nw_overweight, nw_obese, nw_nw, nw_uw]
ne_weights = [ne_overweight, ne_obese, ne_nw, ne_uw]

# One radar trace per region. Each row holds: trace name, radial values,
# line colour, marker colour, and the polar subplot the trace is drawn in.
region_traces = (
    ("SouthWest", sw_weights, "rgba(0, 128, 128, 0.95)", "rgba(0, 74, 147, 1)", "polar"),
    ("SouthEast", se_weights, "rgba(255, 72, 72, 0.95)", "rgba(219, 0, 0, 1)", "polar2"),
    ("NorthWest", nw_weights, "rgba(72, 255, 72, 0.95)", "rgba(0, 147, 74, 1)", "polar3"),
    ("NorthEast", ne_weights, "rgba(247, 133, 11, 0.95)", "rgba(245, 168, 86, 1)", "polar4"),
)

# All four traces share the same mode, fill, marker shape and marker size;
# only the values, colours and target subplot differ.
data = [
    go.Scatterpolar(
        name=region_name,
        r=radial_values,
        theta=weight_labels,
        mode="lines+markers",
        fill='toself',
        line={"color": line_color},
        marker={"color": marker_color, "symbol": "square", "size": 8},
        subplot=subplot_id,
    )
    for region_name, radial_values, line_color, marker_color, subplot_id in region_traces
]

def _polar_subplot(x_domain, y_domain):
    """Return the settings for one polar subplot placed at the given x/y domain.

    Every subplot uses the same tick font sizes, angular rotation and
    direction; only its position within the figure differs.
    """
    return dict(
        domain=dict(x=x_domain, y=y_domain),
        radialaxis=dict(tickfont=dict(size=6)),
        angularaxis=dict(
            tickfont=dict(size=8),
            rotation=40,
            direction="clockwise",
        ),
    )


# Four polar subplots arranged as a 2x2 grid: `polar` and `polar3` take the
# upper y-range, `polar2` and `polar4` the lower one.
layout = go.Layout(
    title="Average Patient Charges <br> by Region <br>(Depending on the Patient's Weight Condition)",
    title_x=0.5,
    showlegend=False,
    paper_bgcolor="rgb(252, 234, 161)",
    polar=_polar_subplot([0, 0.46], [0.56, 1]),
    polar2=_polar_subplot([0, 0.46], [0, 0.44]),
    polar3=_polar_subplot([0.54, 1], [0.56, 1]),
    polar4=_polar_subplot([0.54, 1], [0, 0.44]),
)

fig = go.Figure(data=data, layout=layout)
# Fix the figure size at 800x800.
fig['layout'].update(height=800, width=800)
iplot(fig, filename='polar/directions')

Unsupervised Learning:

Performing Clustering in a Manual Way: In the first plot we will do a cluster analysis by hand and see what our eyes can discover. Here are the results of that manual cluster analysis.

Age and Charges: We can see there is a slight increase in charges depending on the age of the patient.
Obese Clusters: We can see that for each age group there are clusters of the obese group in the top part of charges.
Are these clusters Smokers? As seen in the right chart, most of these clusters are definitely smokers.

# Side-by-side scatter plots of charges against BMI: the left one is coloured
# by weight condition, the right one by smoker status.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

# Arguments common to both scatter plots.
shared_scatter = dict(x="bmi", y="charges", data=df, palette="Set1")

# Left panel: hue = weight condition.
sns.scatterplot(hue="weight_condition", ax=ax1, **shared_scatter)
ax1.set_title("Relationship between Charges and BMI by Weight Condition")
ax1.annotate(
    'Obese Cluster \n (Does this cluster has \n the Smoking Attribute?)',
    xy=(37, 50000),
    xytext=(30, 60000),
    fontsize=12,
    arrowprops={'facecolor': 'black'},
)

# Right panel: hue = smoker.
sns.scatterplot(hue="smoker", ax=ax2, **shared_scatter)
ax2.set_title("Relationship between Charges and BMI by Smoking Condition")
ax2.annotate(
    'Obese Smoker Cluster ',
    xy=(35, 48000),
    xytext=(20, 60000),
    fontsize=12,
    arrowprops={'facecolor': 'black'},
)
ax2.annotate(
    'The Impact of Smoking to \n Charges on other \n Weight Conditions ',
    xy=(25, 26000),
    xytext=(17, 40000),
    fontsize=12,
    arrowprops={'facecolor': 'black'},
)

Text(17, 40000, 'The Impact of Smoking to \n Charges on other \n Weight Conditions ')

	age	gender	bmi	children	smoker	region	charges
0	19	female	27.900	0	yes	southwest	16884.92400
1	18	male	33.770	1	no	southeast	1725.55230
2	28	male	33.000	3	no	southeast	4449.46200
3	33	male	22.705	0	no	northwest	21984.47061
4	32	male	28.880	0	no	northwest	3866.85520