{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SMclarify Bias Metrics for Marketing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from smclarify.bias import report\n",
    "from typing import Dict\n",
    "from collections import defaultdict\n",
    "import pandas as pd\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download the [Bank Marketing dataset](https://archive.ics.uci.edu/ml/datasets/bank+marketing) from the SageMaker sample-data bucket and load it into a DataFrame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the archive. --fail makes curl exit with an error on an HTTP failure\n",
    "# instead of saving the server's error page as bank-additional.zip.\n",
    "!curl --fail -o bank-additional.zip https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip\n",
    "# -o overwrites files left by a previous run without prompting.\n",
    "!unzip -o bank-additional.zip -d /tmp/\n",
    "!rm -f bank-additional.zip\n",
    "\n",
    "local_data_path = '/tmp/bank-additional/bank-additional-full.csv'\n",
    "df = pd.read_csv(local_data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Pairwise plots of age, campaign and pdays; the trailing ';' hides the PairGrid repr.\n",
    "sns.pairplot(df[['age', 'campaign', 'pdays']]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Number of rows per value of the label column 'y'; the trailing ';' hides the Axes repr.\n",
    "sns.countplot(data=df, x='y');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate pre-training bias metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Measure bias for the marital attribute.\n",
    "# Column 'y' is the label, with 'yes' as the positive outcome.\n",
    "facet_column = report.FacetColumn('marital')\n",
    "label_column = report.LabelColumn(name='y', series=df['y'], positive_label_values=['yes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Pre-training report for the marital facet, with education passed as the group variable.\n",
    "report.bias_report(\n",
    "    df,\n",
    "    facet_column,\n",
    "    label_column,\n",
    "    stage_type=report.StageType.PRE_TRAINING,\n",
    "    group_variable=df['education'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Measure bias for the age attribute, which we bucket into 3 equal-width bins.\n",
    "df['age_disc'] = pd.cut(df['age'], bins=3, labels=['young', 'middle', 'old'])\n",
    "facet_column = report.FacetColumn('age_disc')\n",
    "label_column = report.LabelColumn(name='y', series=df['y'], positive_label_values=['yes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Pre-training report for the bucketed age facet, with education passed as the group variable.\n",
    "report.bias_report(\n",
    "    df,\n",
    "    facet_column,\n",
    "    label_column,\n",
    "    stage_type=report.StageType.PRE_TRAINING,\n",
    "    group_variable=df['education'],\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}