# Suppress warnings in Jupyter Notebooks.
# NOTE(review): the "ignore" action is installed without a category or module
# filter, so every warning is hidden -- including deprecation warnings raised
# by the plotting and data libraries used below.
import warnings
warnings.filterwarnings("ignore")
In this end-to-end clustering and regression tutorial, the medical cost patient dataset is used. The first step is to import the necessary libraries.
The following libraries are used: pandas - to manipulate data frames; numpy - providing linear algebra; seaborn - to create nice visualizations; matplotlib - basic tools for visualizations; scikit-learn - machine learning library.
import numpy as np
import pandas as pd
# Plotly Packages
import plotly
import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns
from string import ascii_letters
# Statistical Libraries
from scipy.stats import norm
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from scipy import stats
# Regression Modeling
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# Read the insurance data from the working directory.
df = pd.read_csv("insurance.csv")

# Keep an untouched snapshot so later feature engineering on `df` can always
# be compared against (or reset to) the raw data.
original_df = df.copy(deep=True)

# Preview the first rows, then render the same preview as a plotly table.
df.head()
preview_rows = df.head()
dat = ff.create_table(preview_rows)
iplot(dat)
# Compare the distribution of charges on the raw scale with the distribution
# of log(charges), one histogram per subplot row.
# Imported explicitly so the block does not rely on `import plotly` having
# loaded the `subplots` submodule as a side effect.
from plotly.subplots import make_subplots

charge_dist = df["charges"].values
logcharge = np.log(df["charges"])

trace0 = go.Histogram(
    x=charge_dist,
    histnorm='probability',
    name="Charges Distribution",
    marker=dict(color='#FA5858'),
)
trace1 = go.Histogram(
    x=logcharge,
    histnorm='probability',
    name="Charges Distribution using Log",
    marker=dict(color='#58FA82'),
)

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=('Distribution of Charge (Original Data)',
                    'Distribution of Charge (Log transform)'),
    print_grid=False,
)
# add_trace(row=..., col=...) is the supported replacement for the deprecated
# Figure.append_trace.
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=2, col=1)
fig.update_layout(showlegend=True, title='Distribution of Charges ($Cost)', bargap=0.05)
iplot(fig, filename='custom-sized-subplot-with-subplot-titles')
Turning Age into Categorical Variables: Young Adult: from 18 to 35; Senior Adult: from 36 to 55; Elder: 56 or older. Share of each category: Young Adults (42.9%), Senior Adults (41%) and Elders (16.1%).
# Bucket ages into three categories. The right-closed bins reproduce the
# original rules exactly: (17, 35] -> Young Adult, (35, 55] -> Senior Adult,
# (55, inf) -> Elder; anything else (age <= 17 or missing) stays NaN.
# Building the column in one step avoids writing strings into a float NaN
# column, and the cast to object keeps plain string values (not a Categorical).
df['age_cat'] = pd.cut(
    df['age'],
    bins=[17, 35, 55, np.inf],
    labels=['Young Adult', 'Senior Adult', 'Elder'],
).astype(object)

# Take labels and values from the SAME value_counts() result. Previously the
# labels came from unique() (order of first appearance) while the values came
# from value_counts() (descending frequency), so slices could be mislabelled.
age_counts = df["age_cat"].value_counts()
labels = age_counts.index.tolist()
amount = age_counts.tolist()

# The third colour previously carried a stray leading space.
colors = ["#ff9999", "#b3d9ff", "#e6ffb3"]

trace = go.Pie(
    labels=labels,
    values=amount,
    hoverinfo='label+percent',
    textinfo='value',
    textfont=dict(size=20),
    marker=dict(colors=colors, line=dict(color='#000000', width=2)),
)
data = [trace]
layout = go.Layout(title="Amount by Age Category")
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic_pie_chart')
Is there a Relationship between BMI and Age?
# Distribution plot (histogram + density curve + rug) of BMI over all patients.
# create_distplot expects a list of groups, hence the single-element list.
bmi_values = df["bmi"].values.tolist()
bmi = [bmi_values]
group_labels = ['Body Mass Index Distribution']
colors = ['#FA5858']

fig = ff.create_distplot(bmi, group_labels, colors=colors)
# Add title
fig.update_layout(title='Normal Distribution <br> Central Limit Theorem Condition')
iplot(fig, filename='Basic Distplot')
# Correlation heatmap of the numeric columns.
# The frame also holds string columns at this point (e.g. gender, smoker,
# region, age_cat); DataFrame.corr() raises on those with pandas >= 2.0, so
# restrict to numeric columns explicitly. Older pandas dropped them silently,
# so the resulting matrix is the same.
corr = df.select_dtypes(include="number").corr()
print(corr)

axis_labels = corr.index.values.tolist()
hm = go.Heatmap(
    z=corr.values,
    x=axis_labels,
    y=axis_labels,
)
data = [hm]
layout = go.Layout(title="Correlation Heatmap")
fig = dict(data=data, layout=layout)
iplot(fig, filename='labelled-heatmap')
# Body Mass Index by Age Category: one box (with mean marker) per age bucket.
young_adults = df["bmi"].loc[df["age_cat"] == "Young Adult"].values
senior_adult = df["bmi"].loc[df["age_cat"] == "Senior Adult"].values
elders = df["bmi"].loc[df["age_cat"] == "Elder"].values

trace0 = go.Box(
    y=young_adults,
    name='Young Adults',
    boxmean=True,
    marker=dict(color='rgb(214, 12, 140)'),
)
trace1 = go.Box(
    y=senior_adult,
    name='Senior Adults',
    boxmean=True,
    marker=dict(color='rgb(0, 128, 128)'),
)
trace2 = go.Box(
    y=elders,
    name='Elders',
    boxmean=True,
    marker=dict(color='rgb(247, 186, 166)'),
)
data = [trace0, trace1, trace2]

# Axis titles use the title.text / title.font form; the flat `titlefont`
# attribute is deprecated.
layout = go.Layout(
    title="Body Mass Index <br> by Age Category",
    xaxis=dict(title=dict(text="Age Category", font=dict(size=16))),
    yaxis=dict(title=dict(text="Body Mass Index", font=dict(size=16))),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
df.head()
# Body Mass Index by Gender Category: one box (with mean marker) per gender.
female = df["bmi"].loc[df["gender"] == "female"].values
male = df["bmi"].loc[df["gender"] == "male"].values

trace0 = go.Box(
    y=female,
    name='Female',
    boxmean=True,
    marker=dict(color='rgb(214, 12, 140)'),
)
trace1 = go.Box(
    y=male,
    name='Male',
    boxmean=True,
    marker=dict(color='rgb(0, 128, 128)'),
)
data = [trace0, trace1]

# Axis titles use the title.text / title.font form; the flat `titlefont`
# attribute is deprecated.
layout = go.Layout(
    title="Body Mass Index <br> by Gender Category",
    xaxis=dict(title=dict(text="Gender Category", font=dict(size=16))),
    yaxis=dict(title=dict(text="Body Mass Index", font=dict(size=16))),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Comparing Independent Categorical Variables to do ANOVA tests
# Fit BMI against each categorical factor in turn and print the OLS summary:
# first the age category, then gender. After the loop `moore_lm` holds the
# last fit (the gender model).
import statsmodels.api as sm
from statsmodels.formula.api import ols

for bmi_formula in ("bmi ~ age_cat", "bmi ~ gender"):
    moore_lm = ols(bmi_formula, data=df).fit()
    print(moore_lm.summary())
# Body Mass Index of Smokers Status by Age Category:
# six boxes, smoker / non-smoker side by side for each age bucket.
import plotly.graph_objs as go

# Smokers
ya_smoker = df["bmi"].loc[(df["age_cat"] == "Young Adult") & (df["smoker"] == "yes")].values
sa_smoker = df["bmi"].loc[(df["age_cat"] == "Senior Adult") & (df["smoker"] == "yes")].values
e_smoker = df["bmi"].loc[(df["age_cat"] == "Elder") & (df["smoker"] == "yes")].values
# Non-Smokers
ya_nonsmoker = df["bmi"].loc[(df["age_cat"] == "Young Adult") & (df["smoker"] == "no")].values
sa_nonsmoker = df["bmi"].loc[(df["age_cat"] == "Senior Adult") & (df["smoker"] == "no")].values
e_nonsmoker = df["bmi"].loc[(df["age_cat"] == "Elder") & (df["smoker"] == "no")].values

# The three lists below are positionally aligned: label, values, fill colour.
x_data = ['Young A. Smoker', 'Young A. Non-Smoker',
          'Senior A. Smoker', 'Senior A. Non-Smoker',
          'Elder Smoker', 'Elder Non-Smoker']
y_data = [ya_smoker, ya_nonsmoker, sa_smoker, sa_nonsmoker, e_smoker, e_nonsmoker]
# Red for smokers, green for non-smokers, repeated per age bucket.
colors = ['rgba(251, 43, 43, 0.5)', 'rgba(125, 251, 137, 0.5)'] * 3

traces = [
    go.Box(
        y=yd,
        name=xd,
        boxpoints='all',
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(size=2),
        line=dict(width=1),
    )
    for xd, yd, cls in zip(x_data, y_data, colors)
]

# Axis titles use the title.text / title.font form; the flat `titlefont`
# attribute is deprecated.
layout = go.Layout(
    title='Body Mass Index of Smokers Status by Age Category',
    xaxis=dict(title=dict(text="Status", font=dict(size=16))),
    yaxis=dict(
        title=dict(text="Body Mass Index", font=dict(size=16)),
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(l=40, r=30, b=80, t=100),
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 243, 192)',
    showlegend=False,
)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)
# Body Mass Index of Gender Status by Age Category:
# six boxes, male / female side by side for each age bucket.
import plotly.graph_objs as go

# Male
ya_male = df["bmi"].loc[(df["age_cat"] == "Young Adult") & (df["gender"] == "male")].values
sa_male = df["bmi"].loc[(df["age_cat"] == "Senior Adult") & (df["gender"] == "male")].values
e_male = df["bmi"].loc[(df["age_cat"] == "Elder") & (df["gender"] == "male")].values
# Female
ya_female = df["bmi"].loc[(df["age_cat"] == "Young Adult") & (df["gender"] == "female")].values
sa_female = df["bmi"].loc[(df["age_cat"] == "Senior Adult") & (df["gender"] == "female")].values
e_female = df["bmi"].loc[(df["age_cat"] == "Elder") & (df["gender"] == "female")].values

# The three lists below are positionally aligned: label, values, fill colour.
x_data = ['Young A. Male', 'Young A. Female',
          'Senior A. Male', 'Senior A. Female',
          'Elder Male', 'Elder Female']
y_data = [ya_male, ya_female, sa_male, sa_female, e_male, e_female]
# Red for male, green for female, repeated per age bucket.
colors = ['rgba(251, 43, 43, 0.5)', 'rgba(125, 251, 137, 0.5)'] * 3

traces = [
    go.Box(
        y=yd,
        name=xd,
        boxpoints='all',
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(size=2),
        line=dict(width=1),
    )
    for xd, yd, cls in zip(x_data, y_data, colors)
]

# Axis titles use the title.text / title.font form; the flat `titlefont`
# attribute is deprecated.
layout = go.Layout(
    title='Body Mass Index of Gender Status by Age Category',
    xaxis=dict(title=dict(text="Status", font=dict(size=16))),
    yaxis=dict(
        title=dict(text="Body Mass Index", font=dict(size=16)),
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(l=40, r=30, b=80, t=100),
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 243, 192)',
    showlegend=False,
)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)
Who got charged more on Average by Age?
# Mean vs. median charge per age category. The mean is easily pulled by
# outliers or extreme cases, so the median is plotted next to it.
# Imported explicitly so the block does not rely on `import plotly` having
# loaded the `subplots` submodule as a side effect.
from plotly.subplots import make_subplots

ya_charges = df["charges"].loc[df["age_cat"] == "Young Adult"]
sa_charges = df["charges"].loc[df["age_cat"] == "Senior Adult"]
e_charges = df["charges"].loc[df["age_cat"] == "Elder"]

# Means
avg_ya_charge = ya_charges.mean()
avg_sa_charge = sa_charges.mean()
avg_e_charge = e_charges.mean()
# Median
med_ya_charge = ya_charges.median()
med_sa_charge = sa_charges.median()
med_e_charge = e_charges.median()

age_axis = ['Young Adults', 'Senior Adults', 'Elder']
average_plot = go.Bar(
    x=age_axis,
    y=[avg_ya_charge, avg_sa_charge, avg_e_charge],
    name='Mean',
    marker=dict(color="#F5B041"),
)
med_plot = go.Bar(
    x=age_axis,
    y=[med_ya_charge, med_sa_charge, med_e_charge],
    name='Median',
    marker=dict(color="#48C9B0"),
)

fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{}, {}]],
    subplot_titles=('Average Charge by Age', 'Median Charge by Age'),
    shared_yaxes=True,
    print_grid=False,
)
# add_trace(row=..., col=...) is the supported replacement for the deprecated
# Figure.append_trace.
fig.add_trace(average_plot, row=1, col=1)
fig.add_trace(med_plot, row=1, col=2)
fig.update_layout(showlegend=True, title='Age Charges', title_x=0.5, xaxis=dict(title=""),
                  yaxis=dict(title="Patient Charges"), bargap=0.15)
iplot(fig, filename='custom-sized-subplot-with-subplot-titles')
# Mean vs. median charge per (age category, gender) pair. The mean is easily
# pulled by outliers or extreme cases, so the median is plotted next to it.
# Imported explicitly so the block does not rely on `import plotly` having
# loaded the `subplots` submodule as a side effect.
from plotly.subplots import make_subplots


def _charges_for(age_category, gender):
    """Return the charges of the patients in one age category and gender."""
    selected = (df["age_cat"] == age_category) & (df["gender"] == gender)
    return df["charges"].loc[selected]


# Means for male and female
avg_ya_charge_male = _charges_for("Young Adult", "male").mean()
avg_sa_charge_male = _charges_for("Senior Adult", "male").mean()
avg_e_charge_male = _charges_for("Elder", "male").mean()
avg_ya_charge_female = _charges_for("Young Adult", "female").mean()
avg_sa_charge_female = _charges_for("Senior Adult", "female").mean()
avg_e_charge_female = _charges_for("Elder", "female").mean()
# Median
med_ya_charge_male = _charges_for("Young Adult", "male").median()
med_sa_charge_male = _charges_for("Senior Adult", "male").median()
med_e_charge_male = _charges_for("Elder", "male").median()
med_ya_charge_female = _charges_for("Young Adult", "female").median()
med_sa_charge_female = _charges_for("Senior Adult", "female").median()
med_e_charge_female = _charges_for("Elder", "female").median()

group_axis = ['Young Adults Male', 'Senior Adults Male', 'Elder Male',
              'Young Adults Female', 'Senior Adults Female', 'Elder Female']
average_plot = go.Bar(
    x=group_axis,
    y=[avg_ya_charge_male, avg_sa_charge_male, avg_e_charge_male,
       avg_ya_charge_female, avg_sa_charge_female, avg_e_charge_female],
    name='Mean',
    marker=dict(color="#F5B041"),
)
med_plot = go.Bar(
    x=group_axis,
    y=[med_ya_charge_male, med_sa_charge_male, med_e_charge_male,
       med_ya_charge_female, med_sa_charge_female, med_e_charge_female],
    name='Median',
    marker=dict(color="#48C9B0"),
)

fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{}, {}]],
    subplot_titles=('Average Charge by Age', 'Median Charge by Age'),
    shared_yaxes=True,
    print_grid=False,
)
# add_trace(row=..., col=...) is the supported replacement for the deprecated
# Figure.append_trace.
fig.add_trace(average_plot, row=1, col=1)
fig.add_trace(med_plot, row=1, col=2)
fig.update_layout(showlegend=True, title='Age Charges', title_x=0.5, xaxis=dict(title=""),
                  yaxis=dict(title="Patient Charges"), bargap=0.15)
iplot(fig, filename='custom-sized-subplot-with-subplot-titles')
Weight Status: https://www.cancer.org/cancer/cancer-causes/diet-physical-activity/body-weight-and-cancer-risk/adult-bmi.html
Turning BMI into Categorical Variables:
# Bucket BMI into weight conditions using contiguous left-closed intervals:
#   [-inf, 18.5) Underweight, [18.5, 25) Normal Weight,
#   [25, 30) Overweight,      [30, inf) Obese.
# The previous upper bounds (24.986 and 29.926) left BMIs in [24.986, 25) and
# [29.926, 30) unclassified (NaN). A missing BMI still yields NaN.
# The cast to object keeps plain string values rather than a Categorical, so
# downstream unique()/hue ordering stays by order of appearance.
df["weight_condition"] = pd.cut(
    df["bmi"],
    bins=[-np.inf, 18.5, 25, 30, np.inf],
    labels=["Underweight", "Normal Weight", "Overweight", "Obese"],
    right=False,
).astype(object)

df.head()
dat = ff.create_table(df.head())
iplot(dat)
# Three side-by-side strip plots of charges: by age category, by age category
# coloured by weight condition, and by smoking status coloured by weight
# condition.
f, axes = plt.subplots(ncols=3, figsize=(18,8))
ax1, ax2, ax3 = axes

# Arguments shared by all three strip plots.
strip_common = dict(y="charges", data=df, linewidth=1)

# I wonder if the cluster that is on the top is from obese people
sns.stripplot(x="age_cat", ax=ax1, palette="Reds", **strip_common)
ax1.set_title("Relationship between Charges and Age")

sns.stripplot(x="age_cat", hue="weight_condition", ax=ax2, palette="Set2", **strip_common)
ax2.set_title("Relationship of Weight Condition, Age and Charges")

sns.stripplot(x="smoker", hue="weight_condition", ax=ax3, palette="Set2", **strip_common)
# The middle panel already shows the weight-condition legend; drop the duplicate.
ax3.get_legend().remove()
ax3.set_title("Relationship between Smokers and Charges")

plt.show()
# Sanity check: show every row that still has a missing value in any column
# (an empty result means there are no nulls).
rows_with_nulls = df.isnull().any(axis=1)
df[rows_with_nulls]
Weight Status vs Charges
# Scatter of charges against age, coloured by weight condition.
weight_colormap = {
    'Underweight': 'rgb(208, 246, 130)',
    'Normal Weight': 'rgb(166, 246, 130)',
    'Overweight': 'rgb(251, 232, 238)',
    'Obese': 'rgb(253, 45, 28)',
}
marker_style = {'size': 10, 'opacity': 1.0}

fig = ff.create_facet_grid(
    df,
    x='age',
    y='charges',
    color_name='weight_condition',
    show_boxes=False,
    marker=marker_style,
    colormap=weight_colormap,
)
fig.update_layout(
    title="Weight Status vs Charges",
    title_x=0.5,
    width=800,
    height=600,
    plot_bgcolor='rgb(251, 251, 251)',
    paper_bgcolor='rgb(255, 255, 255)',
)
iplot(fig, filename='facet - custom colormap')
# Flag each patient as paying above or below the average charge of obese
# patients.
obese_avg = df["charges"].loc[df["weight_condition"] == "Obese"].mean()

# Build the labels in an object-dtype Series so strings are never written into
# a float NaN column. A charge exactly equal to the average now counts as
# "Below Average" (it was previously left NaN); a missing charge stays NaN.
charge_status = pd.Series(np.nan, index=df.index, dtype=object)
charge_status.loc[df["charges"] > obese_avg] = "Above Average"
charge_status.loc[df["charges"] <= obese_avg] = "Below Average"
df["charge_status"] = charge_status

dat = ff.create_table(df.head())
iplot(dat)
Obesity and the Impact of Smoking on the Wallet:
# Pairwise relationships between the numeric columns, coloured by smoking status.
import seaborn as sns
sns.set(style="ticks")
pal = ["#FA5858", "#58D3F7"]
pair_grid = sns.pairplot(df, hue="smoker", palette=pal)
# Title the whole grid. plt.title() would only title the most recently created
# subplot, not the figure. y > 1 keeps the title above the top row of panels.
pair_grid.fig.suptitle("Smokers", y=1.02)
# What share of the obese patients smoke, and what do the two groups look like
# on average?
# (Original author's finding: 79% of obese patients were non-smokers and the
# remaining 21% were smokers.)
is_obese = df["weight_condition"] == "Obese"
obese_smoker_rows = is_obese & (df["smoker"] == "yes")
obese_nonsmoker_rows = is_obese & (df["smoker"] == "no")

total_obese = len(df.loc[is_obese])
obese_smoker_prop = len(df.loc[obese_smoker_rows])/total_obese
obese_smoker_prop = round(obese_smoker_prop, 2)
obese_nonsmoker_prop = len(df.loc[obese_nonsmoker_rows])/total_obese
obese_nonsmoker_prop = round(obese_nonsmoker_prop, 2)

# Column means for obese smokers and obese non-smokers (includes "charges").
# numeric_only=True is required because the frame also holds string columns,
# on which DataFrame.mean() raises with pandas >= 2.0. The result is still a
# Series indexed by column name.
charge_obese_smoker = df.loc[obese_smoker_rows].mean(numeric_only=True)
charge_obese_nonsmoker = df.loc[obese_nonsmoker_rows].mean(numeric_only=True)
Distribution of Charges (Obese Smoker vs Obese non-Smoker)
# Split violins of charges per weight condition: smokers on the negative
# (left) half, non-smokers on the positive (right) half.
# Per-category horizontal offsets of the raw points and legend visibility
# (only the first category contributes legend entries).
pointspossmoker = [-0.9,-1.1,-0.6,-0.3]
pointposnonsmoker = [0.45,0.55,1,0.4]
showLegend = [True,False,False,False]

# Computed once instead of on every use inside the loop. NaN is dropped so an
# unclassified row cannot add a fifth "category" and overrun the four-element
# position lists above.
weight_conditions = df['weight_condition'].dropna().unique()


def _half_violin(condition, smoker_value, label, side, pointpos,
                 line_color, marker_line_color, show_legend):
    """Build one half-violin trace dict for a weight condition and smoking status."""
    selected = (df['smoker'] == smoker_value) & (df['weight_condition'] == condition)
    return {
        "type": 'violin',
        "x": df['weight_condition'][selected],
        "y": df['charges'][selected],
        "legendgroup": label,
        "scalegroup": label,
        "name": label,
        "side": side,
        "box": {"visible": True},
        "points": 'all',
        "pointpos": pointpos,
        "jitter": 0,
        "scalemode": 'count',
        "meanline": {"visible": True},
        "line": {"color": line_color},
        "marker": {"line": {"width": 2, "color": marker_line_color}},
        "span": [0],
        "showlegend": show_legend,
    }


data = []
# zip() stops at the shortest input, so any categories beyond the four
# configured positions are skipped rather than raising IndexError.
for condition, smoker_pos, nonsmoker_pos, show_legend in zip(
        weight_conditions, pointspossmoker, pointposnonsmoker, showLegend):
    # The trace dicts were previously named `male` / `female`, which overwrote
    # the BMI arrays of the same names defined earlier in the notebook.
    smoker_trace = _half_violin(condition, 'yes', 'Smoker', 'negative',
                                smoker_pos, '#DF0101', '#F78181', show_legend)
    nonsmoker_trace = _half_violin(condition, 'no', 'Non-Smoker', 'positive',
                                   nonsmoker_pos, '#00FF40', '#81F781', show_legend)
    data.append(smoker_trace)
    data.append(nonsmoker_trace)

fig = {
    "data": data,
    "layout": {
        "title": "Charges Distribution of Obese Patients<br><i>Group by Smoking Status",
        "yaxis": {
            "zeroline": False,
            # title.text / title.font replaces the deprecated flat `titlefont`.
            "title": {"text": "Patient Charges", "font": {"size": 16}},
        },
        "violingap": 0,
        "violingroupgap": 0,
        "violinmode": "overlay",
    },
}
iplot(fig, filename='violin/advanced', validate = False)
# Closer look at the obese group, where smokers and non-smokers differ visibly:
# box plot of charges for obese smokers vs. obese non-smokers.
chargedist_sm = df["charges"].loc[(df["weight_condition"] == "Obese") & (df["smoker"] == "yes")].values
chargedist_nsm = df["charges"].loc[(df["weight_condition"] == "Obese") & (df["smoker"] == "no")].values

trace0 = go.Box(
    y=chargedist_sm,
    name='Obese Smokers',
    marker=dict(color='#DF0101'),
)
trace1 = go.Box(
    y=chargedist_nsm,
    name='Obese Non-Smokers',
    marker=dict(color='#00FF40'),
)
data = [trace0, trace1]

# Axis titles use the title.text / title.font form; the flat `titlefont`
# attribute is deprecated.
layout = dict(
    title="Deeper Look into Obese condition by Smoking status",
    title_x=0.5,
    xaxis=dict(title=dict(text="Status", font=dict(size=16))),
    yaxis=dict(title=dict(text="Patient Charges", font=dict(size=16))),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Separation in Charges between Obese Smokers vs Obese Non-Smokers
In this chart we can visualize how obese smokers and obese non-smokers separate into different clusters. Therefore, we can say that smoking is a characteristic that definitely affects a patient's charges.
# Scatter plot of charges against age for all obese patients, split by
# smoking status.
obese_smoker = df.loc[(df["weight_condition"] == "Obese") & (df["smoker"] == "yes")]
obese_nonsmoker = df.loc[(df["weight_condition"] == "Obese") & (df["smoker"] == "no")]

trace0 = go.Scatter(
    x=obese_smoker["age"].values,
    y=obese_smoker["charges"].values,
    name='Smokers',
    mode='markers',
    marker=dict(
        size=10,
        color='#DF0101',
        line=dict(width=2, color='rgb(0, 0, 0)'),
    ),
)
trace1 = go.Scatter(
    x=obese_nonsmoker["age"].values,
    y=obese_nonsmoker["charges"].values,
    name='Non-Smokers',
    mode='markers',
    marker=dict(
        size=10,
        color='#00FF40',
        line=dict(width=2),
    ),
)
data = [trace0, trace1]

# Axis titles use the title.text / title.font form; the flat `titlefont`
# attribute is deprecated.
layout = dict(
    title='Clear Separation between Obese Smokers and Non-Smokers in Charges',
    title_x=0.5,
    yaxis=dict(
        zeroline=False,
        title=dict(text="Patient Charges", font=dict(size=16)),
    ),
    xaxis=dict(
        zeroline=False,
        title=dict(text="Age of the Patient", font=dict(size=16)),
    ),
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='styled-scatter')
Median Patient Charge by Region:
df.head()
# Which regions are present?
df["region"].unique()

# Median charge per region, drawn as a closed radar (polar) shape.
charges_col = df["charges"]
region_col = df["region"]
southwest = np.median(charges_col.loc[region_col == "southwest"].values)
southeast = np.median(charges_col.loc[region_col == "southeast"].values)
northwest = np.median(charges_col.loc[region_col == "northwest"].values)
northeast = np.median(charges_col.loc[region_col == "northeast"].values)
lst = [southwest, southeast, northwest, northeast]

radar_trace = go.Scatterpolar(
    r=[southwest, southeast, northwest, northeast],
    theta=['SouthWest', 'SouthEast', 'NorthWest', 'NorthEast'],
    fill='toself',
)
data = [radar_trace]

# The radial axis runs from zero up to the largest regional median.
radial_axis = dict(visible=True, range=[0, max(lst)])
layout = go.Layout(
    title="Median Charged to Patients by Region",
    paper_bgcolor="rgb(255, 255, 224)",
    polar=dict(radialaxis=radial_axis),
    showlegend=False,
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "radar/basic")
Average Charge by Region depending on the Weight Condition:
# Weight Condition by Region Radar plots: mean charge for every
# (region, weight condition) pair.
df["weight_condition"].unique()


def _mean_charge(region, condition):
    """Return the mean charge of the patients in one region and weight condition."""
    selected = (df["region"] == region) & (df["weight_condition"] == condition)
    return np.mean(df["charges"].loc[selected].values)


# Overweight
sw_overweight = _mean_charge("southwest", "Overweight")
se_overweight = _mean_charge("southeast", "Overweight")
nw_overweight = _mean_charge("northwest", "Overweight")
ne_overweight = _mean_charge("northeast", "Overweight")
# Obese
sw_obese = _mean_charge("southwest", "Obese")
se_obese = _mean_charge("southeast", "Obese")
nw_obese = _mean_charge("northwest", "Obese")
ne_obese = _mean_charge("northeast", "Obese")
# Normal Weight
sw_nw = _mean_charge("southwest", "Normal Weight")
se_nw = _mean_charge("southeast", "Normal Weight")
nw_nw = _mean_charge("northwest", "Normal Weight")
ne_nw = _mean_charge("northeast", "Normal Weight")
# Underweight
sw_uw = _mean_charge("southwest", "Underweight")
se_uw = _mean_charge("southeast", "Underweight")
nw_uw = _mean_charge("northwest", "Underweight")
ne_uw = _mean_charge("northeast", "Underweight")

# Labels, written out in the same order as the per-region lists below.
# They were previously taken from df["weight_condition"].unique(), whose order
# depends on row order and only matched the value order by coincidence.
weight_labels = ["Overweight", "Obese", "Normal Weight", "Underweight"]

# List per region, ordered like weight_labels. The second entry is the obese
# mean; three of the four lists previously repeated the overweight mean there.
sw_weights = [sw_overweight, sw_obese, sw_nw, sw_uw]
se_weights = [se_overweight, se_obese, se_nw, se_uw]
nw_weights = [nw_overweight, nw_obese, nw_nw, nw_uw]
ne_weights = [ne_overweight, ne_obese, ne_nw, ne_uw]
# One radar trace per region, each drawn in its own polar subplot arranged as
# a 2x2 grid. Every row is:
# (display name, radial values, line colour, marker colour, subplot id).
region_specs = [
    ("SouthWest", sw_weights, "rgba(0, 128, 128, 0.95)", "rgba(0, 74, 147, 1)", "polar"),
    ("SouthEast", se_weights, "rgba(255, 72, 72, 0.95)", "rgba(219, 0, 0, 1)", "polar2"),
    ("NorthWest", nw_weights, "rgba(72, 255, 72, 0.95)", "rgba(0, 147, 74, 1)", "polar3"),
    ("NorthEast", ne_weights, "rgba(247, 133, 11, 0.95)", "rgba(245, 168, 86, 1)", "polar4"),
]
data = [
    go.Scatterpolar(
        mode="lines+markers",
        r=radial_values,
        theta=weight_labels,
        fill='toself',
        name=region_name,
        line=dict(color=line_color),
        marker=dict(color=marker_color, symbol="square", size=8),
        subplot=subplot_id,
    )
    for region_name, radial_values, line_color, marker_color, subplot_id in region_specs
]

# Where each polar subplot sits on the page: (x span, y span) in paper units.
polar_domains = {
    "polar": ([0, 0.46], [0.56, 1]),
    "polar2": ([0, 0.46], [0, 0.44]),
    "polar3": ([0.54, 1], [0.56, 1]),
    "polar4": ([0.54, 1], [0, 0.44]),
}
# All four subplots share the same axis styling and differ only in domain.
polar_settings = {
    subplot_id: dict(
        domain=dict(x=x_span, y=y_span),
        radialaxis=dict(tickfont=dict(size=6)),
        angularaxis=dict(tickfont=dict(size=8), rotation=40, direction="clockwise"),
    )
    for subplot_id, (x_span, y_span) in polar_domains.items()
}

layout = go.Layout(
    title="Average Patient Charges <br> by Region <br>(Depending on the Patient's Weight Condition)",
    title_x=0.5,
    showlegend=False,
    paper_bgcolor="rgb(252, 234, 161)",
    **polar_settings
)
fig = go.Figure(data=data, layout=layout)
fig.update_layout(height=800, width=800)
iplot(fig, filename='polar/directions')
Unsupervised Learning:
Performing Clustering in a Manual Way: In the first plot we will do a cluster analysis manually and see what our eyes can discover. Here are the results of the manual cluster analysis.
# Two scatter plots of charges against BMI: the left one coloured by weight
# condition, the right one by smoking status, each annotated with the clusters
# that stand out by eye.
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(18,8))


def _annotate_cluster(axis, text, xy, xytext):
    """Draw a black-arrow annotation with the shared font size on `axis`."""
    axis.annotate(text, xy=xy, xytext=xytext,
                  arrowprops=dict(facecolor='black'),
                  fontsize=12)


# Left panel: weight condition.
sns.scatterplot(x="bmi", y="charges", hue="weight_condition", data=df, palette="Set1", ax=ax1)
ax1.set_title("Relationship between Charges and BMI by Weight Condition")
_annotate_cluster(ax1, 'Obese Cluster \n (Does this cluster has \n the Smoking Attribute?)',
                  xy=(37, 50000), xytext=(30, 60000))

# Right panel: smoking status.
sns.scatterplot(x="bmi", y="charges", hue="smoker", data=df, palette="Set1", ax=ax2)
ax2.set_title("Relationship between Charges and BMI by Smoking Condition")
_annotate_cluster(ax2, 'Obese Smoker Cluster ', xy=(35, 48000), xytext=(20, 60000))
_annotate_cluster(ax2, 'The Impact of Smoking to \n Charges on other \n Weight Conditions ',
                  xy=(25, 26000), xytext=(17, 40000))