# SMclarify Bias Metrics for Marketing Data

In [None]:
from smclarify.bias import report
from typing import Dict
from collections import defaultdict
import pandas as pd
import seaborn as sns

Get the [marketing dataset](https://archive.ics.uci.edu/ml/datasets/bank+marketing).

In [None]:
# Fetch the zipped dataset, extract it under /tmp/ (unzip's -o overwrites any
# existing files without prompting), then delete the downloaded archive.
!curl -o bank-additional.zip https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip
!unzip -o bank-additional.zip -d /tmp/
!rm -rf bank-additional.zip

# Read the CSV (expected under the extraction directory) into the DataFrame
# `df` that the following cells work on.
local_data_path = '/tmp/bank-additional/bank-additional-full.csv'
df = pd.read_csv(local_data_path)

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [None]:
# Preview the first five rows of the DataFrame.
df.head(n=5)

In [None]:
# Plot pairwise relationships between the age, campaign and pdays columns.
sns.pairplot(
    data=df[['age', 'campaign', 'pdays']],
)

In [None]:
# Bar chart of how many rows carry each value of the 'y' column.
sns.countplot(x='y', data=df)

## Calculate pre-training bias metrics

In [None]:
# Measure bias for the marital attribute: 'marital' is the facet, and the
# value 'yes' in column 'y' counts as the positive label.
label_column = report.LabelColumn(
    name='y',
    series=df['y'],
    positive_label_values=['yes'],
)
facet_column = report.FacetColumn('marital')


In [None]:
# Build the pre-training bias report for the current facet_column and
# label_column, passing the education column as the group variable.
report.bias_report(
    df,
    facet_column,
    label_column,
    stage_type=report.StageType.PRE_TRAINING,
    group_variable=df['education'],
)

In [None]:
# Measure bias for the age attribute, which is first bucketed into 3 bins.
# With an integer `bins`, pd.cut splits the observed age range into
# equal-width intervals; the labels are assigned from lowest to highest.
df['age_disc'] = pd.cut(
    df['age'],
    bins=3,
    labels=['young', 'middle', 'old'],
)
label_column = report.LabelColumn(
    name='y',
    series=df['y'],
    positive_label_values=['yes'],
)
facet_column = report.FacetColumn('age_disc')


In [None]:
# Build the pre-training bias report for the current facet_column and
# label_column, passing the education column as the group variable.
report.bias_report(
    df,
    facet_column,
    label_column,
    stage_type=report.StageType.PRE_TRAINING,
    group_variable=df['education'],
)