# Suppress warnings in Jupyter Notebooks
# NOTE: called without a category or module filter, this ignores every warning
# category from every module, deprecation notices included.
import warnings
warnings.filterwarnings("ignore")
In this end-to-end clustering and regression tutorial, the medical cost patient dataset is used. The first step is to import the necessary libraries.
The following libraries are used: pandas - to manipulate data frames; numpy - providing linear algebra; seaborn - to create nice visualizations; matplotlib - basic tools for visualizations; scikit-learn - machine learning library.
import numpy as np
import pandas as pd
# Plotly Packages
import plotly
import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns
from string import ascii_letters
# Statistical Libraries
from scipy.stats import norm
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from scipy import stats
# Regression Modeling
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# Read the insurance data from the CSV file in the working directory.
df = pd.read_csv("insurance.csv")

# Keep an untouched copy, because the cells below add columns to `df`.
original_df = df.copy()

# Preview the first rows, as cell output and as a Plotly table.
df.head()
preview = df.head()
dat = ff.create_table(preview)
iplot(dat)
# Determine the distribution of charge: raw values on top, log-transformed
# values underneath, each normalised to probabilities.
# Import the submodule explicitly instead of relying on `plotly.subplots`
# having been loaded as a side effect of another import.
from plotly.subplots import make_subplots

charge_dist = df["charges"].values
logcharge = np.log(df["charges"])

trace0 = go.Histogram(
    x=charge_dist,
    histnorm='probability',
    name="Charges Distribution",
    marker=dict(color='#FA5858'),
)
trace1 = go.Histogram(
    x=logcharge,
    histnorm='probability',
    name="Charges Distribution using Log",
    marker=dict(color='#58FA82'),
)

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=('Distribution of Charge (Original Data)',
                    'Distribution of Charge (Log transform)'),
    print_grid=False,
)
# add_trace(..., row=, col=) is the supported replacement for the deprecated
# append_trace.
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=2, col=1)
fig.update_layout(showlegend=True, title='Distribution of Charges ($Cost)', bargap=0.05)
iplot(fig, filename='custom-sized-subplot-with-subplot-titles')
Turning Age into Categorical Variables: Young Adult: 18 to 35; Senior Adult: 36 to 55; Elder: 56 or older. Share of each category: Young Adults (42.9%), Senior Adults (41%) and Elders (16.1%).
# Bucket ages into three categories: (17, 35] -> Young Adult,
# (35, 55] -> Senior Adult, above 55 -> Elder. Ages of 17 or below stay missing.
# pd.cut builds the label column directly, which avoids creating a float NaN
# column and then writing strings into it (an incompatible-dtype assignment).
df['age_cat'] = pd.cut(
    df['age'],
    bins=[17, 35, 55, np.inf],
    labels=['Young Adult', 'Senior Adult', 'Elder'],
).astype(object)  # plain strings (NaN where unassigned), not a Categorical

# Take labels and values from the SAME value_counts() result so each pie slice
# is paired with its own label. unique() returns categories in order of
# appearance, which need not match the count-sorted order of value_counts().
age_counts = df["age_cat"].value_counts()
labels = age_counts.index.tolist()
amount = age_counts.tolist()

# The third colour previously had a leading space (" #e6ffb3").
colors = ["#ff9999", "#b3d9ff", "#e6ffb3"]

trace = go.Pie(
    labels=labels,
    values=amount,
    hoverinfo='label+percent',
    textinfo='value',
    textfont=dict(size=20),
    marker=dict(colors=colors, line=dict(color='#000000', width=2)),
)
data = [trace]
layout = go.Layout(title="Amount by Age Category")
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic_pie_chart')
Is there a Relationship between BMI and Age?
# Distribution plot (histogram + KDE + rug) of BMI as a single group.
bmi_values = df["bmi"].tolist()
bmi = [bmi_values]
group_labels = ['Body Mass Index Distribution']
colors = ['#FA5858']

fig = ff.create_distplot(hist_data=bmi, group_labels=group_labels, colors=colors)
# Add title
fig.update_layout(title='Normal Distribution <br> Central Limit Theorem Condition')
iplot(fig, filename='Basic Distplot')
# Correlation heatmap
# Restrict to numeric columns first: the frame also holds string columns
# (e.g. age_cat), and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently dropping them.
corr = df.select_dtypes(include=[np.number]).corr()
print(corr)

hm = go.Heatmap(
    z=corr.values,
    x=corr.columns.tolist(),
    y=corr.index.tolist(),
)
data = [hm]
layout = go.Layout(title="Correlation Heatmap")
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='labelled-heatmap')
# Body Mass Index by Age Category: one box (with mean marker) per age group.
young_adults = df["bmi"].loc[df["age_cat"] == "Young Adult"].values
senior_adult = df["bmi"].loc[df["age_cat"] == "Senior Adult"].values
elders = df["bmi"].loc[df["age_cat"] == "Elder"].values

trace0 = go.Box(
    y=young_adults,
    name='Young Adults',
    boxmean=True,
    marker=dict(color='rgb(214, 12, 140)'),
)
trace1 = go.Box(
    y=senior_adult,
    name='Senior Adults',
    boxmean=True,
    marker=dict(color='rgb(0, 128, 128)'),
)
trace2 = go.Box(
    y=elders,
    name='Elders',
    boxmean=True,
    marker=dict(color='rgb(247, 186, 166)'),
)
data = [trace0, trace1, trace2]

# Axis title font is set via title.font; the `titlefont` shortcut is a
# deprecated attribute.
layout = go.Layout(
    title="Body Mass Index <br> by Age Category",
    xaxis=dict(title=dict(text="Age Category", font=dict(size=16))),
    yaxis=dict(title=dict(text="Body Mass Index", font=dict(size=16))),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
df.head()
# Body Mass Index by Gender Category: one box (with mean marker) per gender.
female = df["bmi"].loc[df["gender"] == "female"].values
male = df["bmi"].loc[df["gender"] == "male"].values

trace0 = go.Box(
    y=female,
    name='Female',
    boxmean=True,
    marker=dict(color='rgb(214, 12, 140)'),
)
trace1 = go.Box(
    y=male,
    name='Male',
    boxmean=True,
    marker=dict(color='rgb(0, 128, 128)'),
)
data = [trace0, trace1]

# Axis title font is set via title.font; the `titlefont` shortcut is a
# deprecated attribute.
layout = go.Layout(
    title="Body Mass Index <br> by Gender Category",
    xaxis=dict(title=dict(text="Gender Category", font=dict(size=16))),
    yaxis=dict(title=dict(text="Body Mass Index", font=dict(size=16))),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Comparing independent categorical variables with ANOVA tests
# Regress BMI on each categorical predictor in turn (age category first,
# then gender) and print the fitted model's summary table.
import statsmodels.api as sm
from statsmodels.formula.api import ols

for predictor in ("age_cat", "gender"):
    # `moore_lm` is rebound on every pass, so it ends up holding the gender model.
    moore_lm = ols("bmi ~ " + predictor, data=df).fit()
    print(moore_lm.summary())
# Body Mass Index of Smokers Status by Age Category
import plotly.graph_objs as go

# (display label, value stored in df["age_cat"])
age_groups = [
    ('Young A.', "Young Adult"),
    ('Senior A.', "Senior Adult"),
    ('Elder', "Elder"),
]
# (display label, value stored in df["smoker"], box fill colour)
smoker_groups = [
    ('Smoker', "yes", 'rgba(251, 43, 43, 0.5)'),
    ('Non-Smoker', "no", 'rgba(125, 251, 137, 0.5)'),
]

# One box per (age category, smoker status) pair, smoker before non-smoker
# within each age category.
traces = []
for age_label, age_value in age_groups:
    for status_label, status_value, fill in smoker_groups:
        selected = (df["age_cat"] == age_value) & (df["smoker"] == status_value)
        traces.append(go.Box(
            y=df.loc[selected, "bmi"].values,
            name=age_label + ' ' + status_label,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            fillcolor=fill,
            marker=dict(size=2),
            line=dict(width=1),
        ))

# Axis title font is set via title.font; the `titlefont` shortcut is a
# deprecated attribute.
layout = go.Layout(
    title='Body Mass Index of Smokers Status by Age Category',
    xaxis=dict(
        title=dict(text="Status", font=dict(size=16)),
    ),
    yaxis=dict(
        title=dict(text="Body Mass Index", font=dict(size=16)),
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(l=40, r=30, b=80, t=100),
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 243, 192)',
    showlegend=False,
)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)
# Body Mass Index of Gender Status by Age Category
import plotly.graph_objs as go

# (display label, value stored in df["age_cat"])
age_groups = [
    ('Young A.', "Young Adult"),
    ('Senior A.', "Senior Adult"),
    ('Elder', "Elder"),
]
# (display label, value stored in df["gender"], box fill colour)
gender_groups = [
    ('Male', "male", 'rgba(251, 43, 43, 0.5)'),
    ('Female', "female", 'rgba(125, 251, 137, 0.5)'),
]

# One box per (age category, gender) pair, male before female within each
# age category.
traces = []
for age_label, age_value in age_groups:
    for gender_label, gender_value, fill in gender_groups:
        selected = (df["age_cat"] == age_value) & (df["gender"] == gender_value)
        traces.append(go.Box(
            y=df.loc[selected, "bmi"].values,
            name=age_label + ' ' + gender_label,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            fillcolor=fill,
            marker=dict(size=2),
            line=dict(width=1),
        ))

# Axis title font is set via title.font; the `titlefont` shortcut is a
# deprecated attribute.
layout = go.Layout(
    title='Body Mass Index of Gender Status by Age Category',
    xaxis=dict(
        title=dict(text="Status", font=dict(size=16)),
    ),
    yaxis=dict(
        title=dict(text="Body Mass Index", font=dict(size=16)),
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(l=40, r=30, b=80, t=100),
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 243, 192)',
    showlegend=False,
)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)
Who got charged more on Average by Age?
# Mean could be affected easily by outliers or extreme cases, so the median
# is plotted next to it for comparison.
# Import the submodule explicitly instead of relying on `plotly.subplots`
# having been loaded as a side effect of another import.
from plotly.subplots import make_subplots

age_order = ["Young Adult", "Senior Adult", "Elder"]
charges_by_age = df.groupby("age_cat")["charges"]

# reindex fixes the category order and yields NaN for a category with no rows,
# matching what .mean()/.median() return on an empty selection.
# Means
avg_ya_charge, avg_sa_charge, avg_e_charge = charges_by_age.mean().reindex(age_order)
# Median
med_ya_charge, med_sa_charge, med_e_charge = charges_by_age.median().reindex(age_order)

average_plot = go.Bar(
    x=['Young Adults', 'Senior Adults', 'Elder'],
    y=[avg_ya_charge, avg_sa_charge, avg_e_charge],
    name='Mean',
    marker=dict(color="#F5B041"),
)
med_plot = go.Bar(
    x=['Young Adults', 'Senior Adults', 'Elder'],
    y=[med_ya_charge, med_sa_charge, med_e_charge],
    name='Median',
    marker=dict(color="#48C9B0"),
)

fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{}, {}]],
    subplot_titles=('Average Charge by Age', 'Median Charge by Age'),
    shared_yaxes=True,
    print_grid=False,
)
# add_trace(..., row=, col=) is the supported replacement for the deprecated
# append_trace.
fig.add_trace(average_plot, row=1, col=1)
fig.add_trace(med_plot, row=1, col=2)
fig.update_layout(
    showlegend=True,
    title='Age Charges',
    title_x=0.5,
    xaxis=dict(title=""),
    yaxis=dict(title="Patient Charges"),
    bargap=0.15,
)
iplot(fig, filename='custom-sized-subplot-with-subplot-titles')
# Mean could be affected easily by outliers or extreme cases, so the median
# is plotted next to it for comparison.
# Import the submodule explicitly instead of relying on `plotly.subplots`
# having been loaded as a side effect of another import.
from plotly.subplots import make_subplots

# Bar order: the three age categories for males, then the same for females.
group_order = pd.MultiIndex.from_product(
    [["male", "female"], ["Young Adult", "Senior Adult", "Elder"]],
    names=["gender", "age_cat"],
)
charges_by_group = df.groupby(["gender", "age_cat"])["charges"]

# reindex fixes the bar order and yields NaN for a combination with no rows,
# matching what .mean()/.median() return on an empty selection.
# Means for male and female
(avg_ya_charge_male, avg_sa_charge_male, avg_e_charge_male,
 avg_ya_charge_female, avg_sa_charge_female, avg_e_charge_female) = (
    charges_by_group.mean().reindex(group_order)
)
# Median
(med_ya_charge_male, med_sa_charge_male, med_e_charge_male,
 med_ya_charge_female, med_sa_charge_female, med_e_charge_female) = (
    charges_by_group.median().reindex(group_order)
)

bar_labels = ['Young Adults Male', 'Senior Adults Male', 'Elder Male',
              'Young Adults Female', 'Senior Adults Female', 'Elder Female']

average_plot = go.Bar(
    x=bar_labels,
    y=[avg_ya_charge_male, avg_sa_charge_male, avg_e_charge_male,
       avg_ya_charge_female, avg_sa_charge_female, avg_e_charge_female],
    name='Mean',
    marker=dict(color="#F5B041"),
)
med_plot = go.Bar(
    x=bar_labels,
    y=[med_ya_charge_male, med_sa_charge_male, med_e_charge_male,
       med_ya_charge_female, med_sa_charge_female, med_e_charge_female],
    name='Median',
    marker=dict(color="#48C9B0"),
)

fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{}, {}]],
    subplot_titles=('Average Charge by Age', 'Median Charge by Age'),
    shared_yaxes=True,
    print_grid=False,
)
# add_trace(..., row=, col=) is the supported replacement for the deprecated
# append_trace.
fig.add_trace(average_plot, row=1, col=1)
fig.add_trace(med_plot, row=1, col=2)
fig.update_layout(
    showlegend=True,
    title='Age Charges',
    title_x=0.5,
    xaxis=dict(title=""),
    yaxis=dict(title="Patient Charges"),
    bargap=0.15,
)
iplot(fig, filename='custom-sized-subplot-with-subplot-titles')
Weight Status: https://www.cancer.org/cancer/cancer-causes/diet-physical-activity/body-weight-and-cancer-risk/adult-bmi.html
Turning BMI into Categorical Variables:
# Classify BMI into contiguous, left-closed bands:
#   bmi < 18.5        -> Underweight
#   18.5 <= bmi < 25  -> Normal Weight
#   25 <= bmi < 30    -> Overweight
#   bmi >= 30         -> Obese
# The previous upper bounds (24.986 and 29.926) left gaps, so a BMI such as
# 24.99 or 29.95 matched no band and stayed NaN.
df["weight_condition"] = pd.cut(
    df["bmi"],
    bins=[-np.inf, 18.5, 25, 30, np.inf],
    right=False,  # intervals are [low, high)
    labels=["Underweight", "Normal Weight", "Overweight", "Obese"],
).astype(object)  # plain strings (NaN only where bmi itself is missing)

df.head()
dat = ff.create_table(df.head())
iplot(dat)
# Three side-by-side strip plots of charges on a shared 18x8 figure.
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 8))
ax1, ax2, ax3 = axes

# I wonder if the cluster that is on the top is from obese people
# Each entry: (target axes, plot-specific stripplot options, panel title).
panels = [
    (ax1,
     dict(x="age_cat", palette="Reds"),
     "Relationship between Charges and Age"),
    (ax2,
     dict(x="age_cat", hue="weight_condition", palette="Set2"),
     "Relationship of Weight Condition, Age and Charges"),
    (ax3,
     dict(x="smoker", hue="weight_condition", palette="Set2"),
     "Relationship between Smokers and Charges"),
]
for axis, options, heading in panels:
    sns.stripplot(y="charges", data=df, ax=axis, linewidth=1, **options)
    axis.set_title(heading)

# Remove the legend from the third panel only.
ax3.legend_.remove()
plt.show()
# Show every row that has at least one missing value (expect none).
rows_with_nulls = df.isna().any(axis="columns")
df.loc[rows_with_nulls]
Weight Status vs Charges
# Scatter of charges against age, coloured by weight condition.
weight_colors = {
    'Underweight': 'rgb(208, 246, 130)',
    'Normal Weight': 'rgb(166, 246, 130)',
    'Overweight': 'rgb(251, 232, 238)',
    'Obese': 'rgb(253, 45, 28)',
}
fig = ff.create_facet_grid(
    df,
    x='age',
    y='charges',
    color_name='weight_condition',
    show_boxes=False,
    marker=dict(size=10, opacity=1.0),
    colormap=weight_colors,
)
# Centered title, fixed 800x600 canvas and light backgrounds.
fig.update_layout(
    title="Weight Status vs Charges",
    title_x=0.5,
    width=800,
    height=600,
    plot_bgcolor='rgb(251, 251, 251)',
    paper_bgcolor='rgb(255, 255, 255)',
)
iplot(fig, filename='facet - custom colormap')
# First find the average of the charges obese people paid.
obese_avg = df["charges"].loc[df["weight_condition"] == "Obese"].mean()

# Label each row relative to that average. A charge exactly equal to the
# average now counts as "Below Average"; before, it matched neither the ">"
# nor the "<" test and was left NaN.
charge_status = pd.Series(
    np.where(df["charges"] > obese_avg, "Above Average", "Below Average"),
    index=df.index,
    dtype=object,
)
# Keep the status missing where no comparison is possible (a missing charge,
# or no obese rows to average over).
comparable = df["charges"].notna() & pd.notna(obese_avg)
df["charge_status"] = charge_status.where(comparable)

dat = ff.create_table(df.head())
iplot(dat)
Obesity and the Impact of Smoking to the Wallet:
# Pairwise relationships between the numeric columns, coloured by smoker status.
import seaborn as sns
sns.set(style="ticks")
pal = ["#FA5858", "#58D3F7"]
sns.pairplot(df, hue="smoker", palette=pal)
# plt.title() only titles the current axes, i.e. a single subplot of the grid;
# suptitle() titles the whole figure. y > 1 keeps it clear of the top row.
plt.suptitle("Smokers", y=1.02)