In [None]:
import boto3
import sagemaker
import os, sys

# Print the installed SageMaker Python SDK version for reference.
# NOTE(review): later cells use 1.x-style SDK names (get_image_uri, RealTimePredictor,
# s3_input, train_instance_count) — confirm they work with the version printed here.
print(sagemaker.__version__)

# Shared SDK session; its default bucket and the prefix below are used to build
# the S3 locations in the rest of the notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = "sagemaker/DEMO-automl-shap"
region = boto3.Session().region_name

# Role when working on a notebook instance
role = sagemaker.get_execution_role()

In [None]:
# Low-level boto3 clients: `sm` drives the AutoML job and cleanup calls below.
# NOTE(review): `sm_rt` (runtime client) is not referenced by any later cell in this notebook.
sm = boto3.Session().client(service_name="sagemaker", region_name=region)
sm_rt = boto3.Session().client("runtime.sagemaker", region_name=region)

## Dataset

Download the direct marketing dataset.

[Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014

!wget -N https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip --no-check-certificate
!unzip -o bank-additional.zip

In [None]:
# Fetch the dataset archive (-N: skip the download unless the remote copy is newer)
# and unpack it, overwriting any previous extraction (-o).
# NOTE(review): --no-check-certificate turns off TLS certificate verification for
# this download — confirm it is still required.
!wget -N https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip --no-check-certificate
!unzip -o bank-additional.zip

In [None]:
import numpy as np
import pandas as pd

from sagemaker.analytics import ExperimentAnalytics
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig
from sagemaker.estimator import Estimator

In [None]:
# Widen pandas' display limits first so the preview below is not truncated.
pd.set_option("display.max_columns", 500)  # show every column
pd.set_option("display.max_rows", 50)  # keep the output on one page

# The raw file is semicolon-delimited.
data = pd.read_csv("./bank-additional/bank-additional-full.csv", sep=";")
data.head(10)  # preview the first 10 rows

In [None]:
# (rows, columns) of the raw dataset.
data.shape

In [None]:
# Shuffle once with a fixed seed, then keep the first 95% of rows for training
# and the remaining 5% for testing. Plain positional slicing yields the same two
# frames as the previous np.split(...) call without creating a throwaway empty
# third split, and avoids passing a DataFrame to np.split (which emits a
# FutureWarning on recent pandas releases).
shuffled_data = data.sample(frac=1, random_state=123)
split_at = int(0.95 * len(shuffled_data))
train_data = shuffled_data.iloc[:split_at]
test_data = shuffled_data.iloc[split_at:]

# Save to CSV files
train_data.to_csv(
    "automl-train.csv", index=False, header=True, sep=","
)  # Need to keep column names
test_data.to_csv("automl-test.csv", index=False, header=True, sep=",")

In [None]:
# Upload the training CSV under "<prefix>/input", the location the AutoML job reads from.
sess.upload_data(path="automl-train.csv", key_prefix=prefix + "/input")

In [None]:
# Write the training rows without the target column and upload them under "<prefix>/validation".
# NOTE(review): no later cell in this notebook reads this file or S3 prefix — confirm it is still needed.
train_data.drop("y", axis=1).to_csv(
    "automl-validation.csv", index=False, header=True, sep=","
)
sess.upload_data(path="automl-validation.csv", key_prefix=prefix + "/validation")

## AutoML

In [None]:
# Limits passed as AutoMLJobConfig: per-training-job runtime, number of
# candidates, and total AutoML job runtime.
job_config = {
    "CompletionCriteria": {
        "MaxRuntimePerTrainingJobInSeconds": 300,
        "MaxCandidates": 50,
        "MaxAutoMLJobRuntimeInSeconds": 3600,
    }
}

# Training data: everything under "<prefix>/input" in the default bucket,
# with column "y" as the target.
input_data_config = [
    {
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": f"s3://{bucket}/{prefix}/input",
            }
        },
        "TargetAttributeName": "y",
    }
]

# Where the AutoML job writes its artifacts.
output_data_config = {"S3OutputPath": f"s3://{bucket}/{prefix}/output"}

problem_type = "BinaryClassification"

# Objective metric used to rank candidates.
job_objective = {"MetricName": "F1"}

In [None]:
from time import gmtime, strftime, sleep

# Suffix the job name with the current UTC day-of-month and time (dd-HH-MM-SS)
# so repeated runs get distinct names.
timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())
auto_ml_job_name = f"automl-shap-{timestamp_suffix}"
print(f"AutoMLJobName: {auto_ml_job_name}")

In [None]:
# Start the AutoML job with the configuration assembled in the cells above.
# The call returns immediately; the next cell polls for completion.
sm.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=job_config,
    AutoMLJobObjective=job_objective,
    ProblemType=problem_type,
    RoleArn=role,
)

In [None]:
%%time

job_run_status = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)[
    "AutoMLJobStatus"
]
print(job_run_status)

while job_run_status not in ("Failed", "Completed", "Stopped"):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response["AutoMLJobStatus"]

    print(
        describe_response["AutoMLJobStatus"]
        + " - "
        + describe_response["AutoMLJobSecondaryStatus"]
    )
    sleep(20)

### Auto-generated Notebook

In [None]:
# Look up the S3 locations of the two notebooks the AutoML job generated
# (candidate definitions and data exploration) and print one per line.
job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job_candidate_notebook, job_data_notebook = (
    job["AutoMLJobArtifacts"][artifact_key]
    for artifact_key in (
        "CandidateDefinitionNotebookLocation",
        "DataExplorationNotebookLocation",
    )
)

print(job_candidate_notebook, job_data_notebook, sep="\n")


def download_gen_notebook(path):
    """
    Download one S3 object into the current working directory.

    `path` is split after its first two "/" characters, and the remainder is
    split once more into bucket and key (i.e. an "s3://bucket/key" style
    location). The object is saved under the last path component of the key.
    Returns the `(bucket, key)` pair that was downloaded.
    """
    location = path.split("/", 2)[-1]
    bucket, key = location.split("/", 1)
    local_name = key.rsplit("/", 1)[-1]
    s3_client = boto3.client("s3")
    s3_client.download_file(Bucket=bucket, Key=key, Filename=local_name)
    return bucket, key


# Download both generated notebooks into the current working directory.
download_gen_notebook(job_candidate_notebook)
download_gen_notebook(job_data_notebook)

### All the Experiment Candidates by AutoML

In [None]:
# Load the analytics for the experiment named "<job name>-aws-auto-ml-job" as a DataFrame.
# NOTE(review): presumably the experiment Autopilot creates for this job — confirm the naming.
analytics = ExperimentAnalytics(
    sagemaker_session=sagemaker.Session(),
    experiment_name=auto_ml_job_name + "-aws-auto-ml-job",
)
analytics.dataframe()

### All the Model Tuning by AutoML

In [None]:
# Fetch the job's candidates sorted by final objective metric and print a
# numbered "<rank>  <name>  <metric value>" line for each.
# NOTE(review): only the first page of the response is read — confirm that is
# enough for the number of candidates configured above.
candidates = sm.list_candidates_for_auto_ml_job(
    AutoMLJobName=auto_ml_job_name, SortBy="FinalObjectiveMetricValue"
)["Candidates"]

for index, candidate in enumerate(candidates, start=1):
    print(
        f"{index}  {candidate['CandidateName']}  "
        f"{candidate['FinalAutoMLJobObjectiveMetric']['Value']}"
    )

In [None]:
# describe_auto_ml_job already returns the winning candidate, so the extra
# list_candidates_for_auto_ml_job call (whose result was discarded) is dropped.
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)[
    "BestCandidate"
]
best_candidate_name = best_candidate["CandidateName"]

print("Candidate name: " + best_candidate_name)

In [None]:
# For each inference container of the best candidate, print its image URI and
# model artifact location, followed by a "-" separator line.
for container in best_candidate["InferenceContainers"]:
    print(container["Image"], container["ModelDataUrl"], "-", sep="\n")

In [None]:
# Display the raw container definitions of the best candidate.
best_candidate["InferenceContainers"]

In [None]:
def model_chain(best_candidate, names):
    """
    Wrap the first inference container of `best_candidate` in a SageMaker Model.

    Only the first container is turned into a model; the remaining containers
    are deliberately left out, so the returned list always has one element.

    Parameters
    ----------
    best_candidate : dict
        Candidate description containing an "InferenceContainers" list whose
        entries provide "ModelDataUrl", "Image" and "Environment".
    names : sequence of str
        One name per inference container. `names[0]` names the returned model
        (previously a hard-coded "feature_engineering" was used and `names`
        was ignored apart from the length check).

    Returns
    -------
    list
        A one-element list holding the model built from the first container.

    Raises
    ------
    AssertionError
        If the number of names differs from the number of containers.
    """
    containers = best_candidate["InferenceContainers"]
    assert len(containers) == len(names), (
        f"expected {len(containers)} names (one per inference container), "
        f"got {len(names)}"
    )

    first_container = containers[0]
    model = sagemaker.model.Model(
        model_data=first_container["ModelDataUrl"],
        image=first_container["Image"],
        env=first_container["Environment"],
        role=role,
        sagemaker_session=sagemaker.Session(),
        name=names[0],
    )

    return [model]


# Build the model list (one name per inference container is required by
# model_chain's length check) and wrap it in a PipelineModel named "fe-chain-diy".
res = model_chain(best_candidate, ["feature_engineering", "model", "label_transform"])

ppl = sagemaker.pipeline.PipelineModel(
    res, role=role, name="fe-chain-diy", sagemaker_session=sagemaker.Session()
)

In [None]:
# List the names of the models that went into the pipeline.
for r in res:
    print(r.name)

In [None]:
# Take the first model returned by model_chain and show its artifact location.
# NOTE(review): `data_transformer` is rebound to the joblib-loaded object in a later cell.
data_transformer = res[0]
data_transformer.model_data

## Download the generated artifacts

In [None]:
# Copy the model artifact from S3 to a local tarball.
!aws s3 cp {data_transformer.model_data} feature_engineering_data_transformer.tar.gz

In [None]:
# Unpack the tarball into the current directory, listing the extracted files.
!tar -xvf feature_engineering_data_transformer.tar.gz

## Load the artifacts

In [None]:
from joblib import load

# Load the serialized transformer (presumably extracted by the tar command above).
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
data_transformer = load(filename="model.joblib")

In [None]:
# Inspect the feature transformer held by the loaded object.
data_transformer.feature_transformer

In [None]:
# Fitted (name, transformer, columns) entries of the "column_transformer" step.
data_transformer.feature_transformer["column_transformer"].transformers_

In [None]:
# Unpack the second fitted entry of the column transformer into its name,
# pipeline and column indices, then ask that pipeline's
# "thresholdonehotencoder" step for the expanded names of those columns.
# NOTE(review): assumes entry 1 is the categorical pipeline — confirm against
# the transformers_ listing above.
_cat_name, _cat_pipeline, category_id = data_transformer.feature_transformer[
    "column_transformer"
].transformers_[1]
new_cat_col = _cat_pipeline["thresholdonehotencoder"].get_feature_names(
    data.columns[category_id]
)

In [None]:
# Display the expanded categorical feature names.
new_cat_col

In [None]:
def get_transformer_feature_names(columnTransformer, input_columns=None):
    """
    List the output feature names produced by a fitted column transformer.

    For every fitted `(name, transformer, columns)` entry except "remainder",
    the input column names are selected and passed through each pipeline step
    in order. A step that exposes `categories_` (a fitted one-hot style
    encoder) replaces the names with `step.get_feature_names(names)`; any
    other step leaves them unchanged.

    Fixes over the previous version:
    * encoder-generated names are kept even when the encoder is not the last
      step of its pipeline (previously only the last step decided the names);
    * entries configured as "drop" contribute no names;
    * a bare estimator (not a pipeline) is treated as a one-step pipeline;
    * an empty pipeline no longer reuses names from the previous entry.

    Parameters
    ----------
    columnTransformer :
        Fitted object exposing `transformers_`.
    input_columns : optional
        Column labels of the untransformed data; must support indexing with
        the `columns` element of each entry. Defaults to the notebook-level
        `data.columns`.

    Returns
    -------
    list
        Output feature names in transformer order.
    """
    if input_columns is None:
        input_columns = data.columns

    output_features = []

    for name, pipe, features in columnTransformer.transformers_:
        print(name, features)
        if name == "remainder":
            continue

        if isinstance(pipe, str):
            if pipe == "drop":
                # Dropped columns never reach the output.
                continue
            steps = []  # "passthrough": names are unchanged
        else:
            try:
                steps = list(pipe)
            except TypeError:
                # A single estimator rather than an iterable pipeline.
                steps = [pipe]

        trans_features = input_columns[features]
        for step in steps:
            if hasattr(step, "categories_"):
                trans_features = step.get_feature_names(trans_features)
        output_features.extend(trans_features)

    return output_features

In [None]:
# Names of all transformed features; used below to label the endpoint output and the SHAP plots.
new_col = get_transformer_feature_names(
    data_transformer.feature_transformer["column_transformer"]
)

## Feature Engineering PipelineModel Deployment

In [None]:
# Data-capture destination: a "capture/" prefix at the root of the default bucket.
s3_capture_path = f"s3://{bucket}/capture/"
print(s3_capture_path)

In [None]:
# Capture every request and response (100% sampling) to the S3 path above.
# NOTE(review): the first positional argument is presumably the enable flag — confirm against the SDK signature.
data_capture_config = sagemaker.model_monitor.DataCaptureConfig(
    True,
    sampling_percentage=100,
    destination_s3_uri=s3_capture_path,
    capture_options=["REQUEST", "RESPONSE"],
    csv_content_types=["text/csv"],
    json_content_types=["application/json"],
    sagemaker_session=sagemaker.Session(),
)

In [None]:
# Deploy the pipeline model to an endpoint named after the pipeline itself and
# block until the deployment finishes (wait=True).
ppl.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    endpoint_name=ppl.name,
    update_endpoint=False,
    wait=True,
    data_capture_config=data_capture_config,
)

## Feature Engineering Data Transformation

In [None]:
from io import StringIO
from sagemaker.predictor import RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV


# Client for the endpoint deployed above; sends and receives CSV.
# The literal endpoint name matches the PipelineModel name used for endpoint_name at deploy time.
predictor = RealTimePredictor(
    endpoint="fe-chain-diy",
    sagemaker_session=sagemaker.Session(),
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_CSV,
)

In [None]:
# Send the test rows (without the label) to the endpoint as header-less CSV in
# a single request and parse the CSV response into a DataFrame.
test_data_2_transform = test_data.drop("y", axis=1)
test_data_transformed = predictor.predict(
    test_data_2_transform.to_csv(sep=",", header=None, index=False)
).decode("utf-8")
test_data_transformed_df = pd.read_csv(StringIO(test_data_transformed), header=None)
# Label the transformed columns and append the original target as the LAST column;
# later cells rely on this layout when they slice with .iloc[:, :-1].
test_data_transformed_df.columns = new_col
test_data_transformed_df["y"] = test_data["y"].values
test_data_transformed_df

In [None]:
# Preview the untransformed training rows.
train_data.head()

In [None]:
# Same transformation as for the test split, applied to the training rows.
# NOTE(review): the whole training split is sent in one request — confirm the
# payload stays within the endpoint's request size limit.
train_data_2_transform = train_data.drop("y", axis=1)
train_data_transformed = predictor.predict(
    train_data_2_transform.to_csv(sep=",", header=None, index=False)
).decode("utf-8")
train_data_transformed_df = pd.read_csv(StringIO(train_data_transformed), header=None)
# Label the transformed columns and append the original target as the LAST column;
# the SHAP cells below slice it off again with .iloc[:, :-1].
train_data_transformed_df.columns = new_col
train_data_transformed_df["y"] = train_data["y"].values
train_data_transformed_df

In [None]:
# Sanity check on the endpoint output: exactly one transformed row per input row.
# (This cell previously referenced `prediction_df`, which is never defined in
# this notebook and raised NameError on a fresh Run All; no predictions exist
# at this point, only transformed features.)
assert test_data_transformed_df.shape[0] == test_data.shape[0]
assert train_data_transformed_df.shape[0] == train_data.shape[0]
train_data_transformed_df.shape, test_data_transformed_df.shape

In [None]:
# SageMaker's built-in XGBoost reads header-less CSV and treats the FIRST column
# as the label. The in-memory frames keep "y" as the last column (later cells
# slice it off with .iloc[:, :-1]), so reorder only for the files written here:
# label first, encoded as 0/1, followed by the transformed features in `new_col`
# order. Writing the string label last (as before) made XGBoost train on the
# first feature as its target and shifted every SHAP column by one.
label_encoding = {"no": 0, "yes": 1}  # expected raw target values; checked below
for transformed_df, csv_path in (
    (train_data_transformed_df, "automl-transformed-train-2nd.csv"),
    (test_data_transformed_df, "automl-transformed-test-2nd.csv"),
):
    encoded_label = transformed_df["y"].map(label_encoding)
    if encoded_label.isna().any():
        raise ValueError(
            f"Unexpected target values in column 'y' while writing {csv_path}; "
            f"expected only {sorted(label_encoding)}."
        )
    pd.concat(
        [encoded_label.astype(int), transformed_df.drop(columns="y")], axis=1
    ).to_csv(csv_path, index=False, header=False, sep=",")

## Cleanup Feature Engineering Transformation Endpoint

In [None]:
# Tear down the feature-engineering endpoint, its endpoint config and its model.
# All three are addressed by the literal pipeline name "fe-chain-diy" used above.
sm.delete_endpoint(EndpointName="fe-chain-diy")
sm.delete_endpoint_config(EndpointConfigName="fe-chain-diy")
sm.delete_model(ModelName="fe-chain-diy")

## Hyperparameters of the best model

In [None]:
# Install the packages imported further down (shap, smdebug).
# NOTE(review): `!python` may resolve to a different interpreter than the running
# kernel (`%pip install` targets the kernel's environment), and the versions are
# unpinned although later cells use a positional shap.waterfall_plot call — confirm.
!python -m pip install shap
!python -m pip install smdebug

In [None]:
# The tuning job name is taken to be the first four "-"-separated parts of the
# best candidate's name.
# NOTE(review): relies on the candidate naming convention — confirm.
hyper_tunning_name = "-".join(best_candidate_name.split("-")[:4])
tuner = sagemaker.HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name=hyper_tunning_name
)

# All training jobs of the tuning job, best objective value first.
full_df = tuner.dataframe().sort_values("FinalObjectiveValue", ascending=False)
full_df

In [None]:
# Pick the training job with the highest final objective value and read its
# tuned hyperparameters and model type (the last "-"-separated part of the
# training job definition name).
#
# Jobs without a final metric (e.g. failed or stopped ones) get NaN and are
# sorted LAST, so they can neither crash the metric extraction nor be selected
# as the "best" job.
job_summaries = tuner.training_job_summaries()
tune_job_summary = pd.DataFrame(job_summaries)
tune_job_summary["MetricValue"] = [
    (summary.get("FinalHyperParameterTuningJobObjectiveMetric") or {}).get(
        "Value", float("nan")
    )
    for summary in job_summaries
]
tune_job_summary = tune_job_summary.sort_values(
    by="MetricValue", ascending=False, na_position="last"
)
if tune_job_summary["MetricValue"].isna().all():
    raise ValueError(
        f"No training job of tuning job {hyper_tunning_name!r} reported a final "
        "objective metric."
    )

best_tuning_job = tune_job_summary.iloc[0]
best_hyper = best_tuning_job["TunedHyperParameters"]
model_type = best_tuning_job["TrainingJobDefinitionName"].split("-")[-1]
best_hyper, model_type

In [None]:
# Only the built-in XGBoost image is handled below. Fail loudly for any other
# model type: otherwise `container` would silently keep the dict left behind by
# the inference-container loop earlier in the notebook and be passed to the
# Estimator as its image.
if model_type != "xgb":
    raise ValueError(
        f"Best candidate uses model type {model_type!r}; "
        "this notebook only supports 'xgb'."
    )
container = get_image_uri(region, "xgboost", repo_version="0.90-2")

base_job_name = "smdebug-xgboost-prediction"
bucket_path = f"s3://{bucket}"
save_interval = 1  # Debugger saves every collection at every step

In [None]:
# Re-train the best candidate's algorithm with its tuned hyperparameters, with
# SageMaker Debugger saving the metrics, feature-importance and SHAP collections
# every `save_interval` steps, plus a built-in rule watching the metrics.
xgboost_estimator = Estimator(
    role=role,
    base_job_name=base_job_name,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    image_name=container,
    hyperparameters=best_hyper,
    train_max_run=1800,
    debugger_hook_config=DebuggerHookConfig(
        s3_output_path=bucket_path,  # Required
        collection_configs=[
            CollectionConfig(
                name="metrics", parameters={"save_interval": str(save_interval)}
            ),
            CollectionConfig(
                name="feature_importance",
                parameters={"save_interval": str(save_interval)},
            ),
            CollectionConfig(
                name="full_shap", parameters={"save_interval": str(save_interval)}
            ),
            CollectionConfig(
                name="average_shap", parameters={"save_interval": str(save_interval)}
            ),
        ],
    ),
    rules=[
        # Built-in "loss not decreasing" rule evaluated on the "metrics" collection.
        Rule.sagemaker(
            rule_configs.loss_not_decreasing(),
            rule_parameters={
                "collection_names": "metrics",
                "num_steps": str(save_interval * 2),
            },
        )
    ],
)

In [None]:
from sagemaker.session import s3_input

# Upload the transformed train/test CSVs to their own prefixes; the training
# channels in the next cell point at these objects.
sess.upload_data(
    path="automl-transformed-train-2nd.csv", key_prefix=prefix + "/transformedtrain-2nd"
)
sess.upload_data(
    path="automl-transformed-test-2nd.csv", key_prefix=prefix + "/transformedtest-2nd"
)

In [None]:
# CSV channels for training and validation, pointing at the files uploaded above
# (the transformed test split doubles as the validation channel).
train_input = s3_input(
    "s3://{}/{}/{}".format(
        bucket, prefix, "transformedtrain-2nd/automl-transformed-train-2nd.csv"
    ),
    content_type="csv",
)
validation_input = s3_input(
    "s3://{}/{}/{}".format(
        bucket, prefix, "transformedtest-2nd/automl-transformed-test-2nd.csv"
    ),
    content_type="csv",
)
xgboost_estimator.fit(
    {"train": train_input, "validation": validation_input},
    # Fire and forget: with wait=False the job is submitted to run in the background
    # and control returns to the notebook immediately.
    # The next cell polls the status of the training job.
    wait=False,
)

In [None]:
import time

# Poll the training job (and its Debugger rule) every 10 seconds, for at most
# 36 attempts, until the job reaches a terminal state.
# The job name and client do not change between iterations, so look them up once.
job_name = xgboost_estimator.latest_training_job.name
client = xgboost_estimator.sagemaker_session.sagemaker_client

for _ in range(36):
    description = client.describe_training_job(TrainingJobName=job_name)
    training_job_status = description["TrainingJobStatus"]
    rule_job_summary = xgboost_estimator.latest_training_job.rule_job_summary()
    rule_evaluation_status = rule_job_summary[0]["RuleEvaluationStatus"]
    print(
        "Training job status: {}, Rule Evaluation Status: {}".format(
            training_job_status, rule_evaluation_status
        )
    )

    # "Stopped" is terminal as well; without it a stopped job was polled until
    # the attempts ran out.
    if training_job_status in ("Completed", "Failed", "Stopped"):
        break

    time.sleep(10)
else:
    # The loop ended without a terminal status: make that visible instead of
    # silently continuing to cells that need the finished job.
    print(
        "Training job {} is still {} after the polling window; "
        "re-run this cell before continuing.".format(job_name, training_job_status)
    )

In [None]:
# Final status of the Debugger rule evaluation for the training job.
xgboost_estimator.latest_training_job.rule_job_summary()

In [None]:
from smdebug.trials import create_trial

# Open the Debugger output of the latest training job as an smdebug trial and
# list the tensors it recorded.
s3_output_path = xgboost_estimator.latest_job_debugger_artifacts_path()
trial = create_trial(s3_output_path)
trial.tensor_names()

In [None]:
from itertools import islice
import matplotlib.pyplot as plt
import re

MAX_PLOTS = 35


def get_data(trial, tname):
    """
    Collect every recorded value of one tensor.

    Looks up `tname` on `trial`, asks the tensor which steps it has data
    for, and reads the value at each of those steps.
    Returns a `(steps, values)` pair, with values in the same order as steps.
    """
    recorded = trial.tensor(tname)
    steps = recorded.steps()
    return steps, list(map(recorded.value, steps))


def plot_collection(trial, collection_name, regex=".*", figsize=(8, 6)):
    """
    Draw one line per tensor of a collection on a single set of axes.

    Tensor names of `collection_name` are sorted, filtered to those whose
    start matches `regex`, and capped at MAX_PLOTS; each remaining tensor is
    plotted as value-versus-step with its name as the legend label.
    """
    fig, ax = plt.subplots(figsize=figsize)
    pattern = re.compile(regex)
    selected = [
        name
        for name in sorted(trial.collection(collection_name).tensor_names)
        if pattern.match(name)
    ]
    for tensor_name in selected[:MAX_PLOTS]:
        ax.plot(*get_data(trial, tensor_name), label=tensor_name)

    # Legend sits outside the axes, to the right.
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    ax.set_xlabel("Iteration")

In [None]:
# Plot every tensor in the "metrics" collection against the training step.
plot_collection(trial, "metrics")

In [None]:
def plot_feature_importance(trial, importance_type="weight"):
    """
    Plot the "feature_importance" collection for one importance type.

    `importance_type` must be one of "weight", "gain", "cover", "total_gain"
    or "total_cover"; anything else raises ValueError. Only tensors named
    "feature_importance/<importance_type>/..." are plotted.
    """
    supported = ("weight", "gain", "cover", "total_gain", "total_cover")
    if importance_type in supported:
        plot_collection(
            trial,
            "feature_importance",
            regex=f"feature_importance/{importance_type}/.*",
        )
        return
    raise ValueError(f"{importance_type} is not one of the supported importance types.")


# Show the "cover" importance of each feature over the training steps.
plot_feature_importance(trial, importance_type="cover")

In [None]:
# Plot every tensor in the "average_shap" collection against the training step.
plot_collection(trial, "average_shap")

In [None]:
import shap

# Read the "full_shap/f0" tensor at the last completed step.
# The last column is split off as the base value (taken from the first row);
# the remaining columns are the per-feature SHAP values.
# NOTE(review): assumes one row per training example and one column per entry of `new_col` — confirm.
shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)
shap_no_base = shap_values[:, :-1]
shap_base_value = shap_values[0, -1]
shap.summary_plot(shap_no_base, plot_type="bar", feature_names=new_col)

In [None]:
# Summary plot of the SHAP values against the transformed training features
# (.iloc[:, :-1] drops the trailing "y" column).
shap.summary_plot(shap_no_base, train_data_transformed_df.iloc[:, :-1])

In [None]:
# Load shap's JavaScript so the interactive (non-matplotlib) force plot below can render.
shap.initjs()

In [None]:
# Explain a single training row; `idx` is reused by the waterfall plot below.
# NOTE(review): 4000 is an arbitrary row position and must be smaller than the number of training rows.
idx = 4000
shap.force_plot(
    shap_base_value,
    shap_no_base[idx, :],
    train_data_transformed_df.iloc[idx, :-1],
    link="logit",
    matplotlib=True,
)

In [None]:
# Waterfall view of the same row (`idx` comes from the previous cell).
# NOTE(review): this positional (base value, SHAP values, features) call depends on the installed shap version — confirm.
shap.waterfall_plot(
    shap_base_value, shap_no_base[idx, :], train_data_transformed_df.iloc[idx, :-1]
)

In [None]:
import numpy as np

# Stacked force plot for up to 100 randomly chosen training rows.
# Rows are drawn WITHOUT replacement from a seeded generator: the previous
# np.random.randint draw could pick the same row several times and produced a
# different plot on every run.
N_ROWS = shap_no_base.shape[0]
N_SAMPLES = min(100, N_ROWS)
sampled_indices = np.random.RandomState(123).choice(
    N_ROWS, size=N_SAMPLES, replace=False
)

shap.force_plot(
    shap_base_value,
    shap_no_base[sampled_indices, :],
    train_data_transformed_df.iloc[sampled_indices, :-1],
    link="logit",
)

In [None]:
# top outliers
from scipy import stats

N_OUTLIERS = 3  # number of outliers on each side of the tail

# Score each row by the z-score of its summed SHAP contributions, then take the
# N_OUTLIERS highest-scoring rows followed by the N_OUTLIERS lowest-scoring rows.
shap_sum = np.sum(shap_no_base, axis=1)
z_scores = stats.zscore(shap_sum)
upper_tail = np.argpartition(z_scores, -N_OUTLIERS)[-N_OUTLIERS:]
lower_tail = np.argpartition(z_scores, N_OUTLIERS)[:N_OUTLIERS]
outlier_indices = upper_tail.tolist() + lower_tail.tolist()

In [None]:
# One matplotlib force plot per outlier row found above.
for outlier_index in outlier_indices:
    shap.force_plot(
        shap_base_value,
        shap_no_base[outlier_index, :],
        train_data_transformed_df.iloc[outlier_index, :-1],
        link="logit",
        matplotlib=True,
    )