In [None]:
import boto3
import sagemaker
import json
import os
import pandas as pd
from utils import get_aws_profile_name, get_aws_iam_role

# Toggle for how credentials and the IAM role are resolved:
#   True  -> use the named AWS profile and look the role ARN up through IAM
#   False -> use the default boto3 session and sagemaker.get_execution_role()
LOCAL_EXECUTION = True

if not LOCAL_EXECUTION:
    sess = boto3.Session()
    sm = sess.client("sagemaker")
    role = sagemaker.get_execution_role()
else:
    sess = boto3.Session(profile_name=get_aws_profile_name())
    sm = sess.client("sagemaker")
    iam = sess.client('iam')
    role_description = iam.get_role(RoleName=get_aws_iam_role())
    role = role_description['Role']['Arn']

# Everything below is shared by both execution modes.
sagemaker_session = sagemaker.Session(boto_session=sess)
region = sess.region_name
prefix = "model-monitor-bring-your-own-model/"
bucket = sagemaker_session.default_bucket()

In [None]:
# Data-quality baseline input: key prefix and the S3 URI built from it.
baseline_data_prefix = prefix + "data-quality/baseline_input_data"
baseline_data_uri = "s3://{}/{}".format(bucket, baseline_data_prefix)

# Data-quality baseline results: key prefix and the S3 URI built from it.
baseline_results_prefix = prefix + "data-quality/baseline_results"
baseline_results_uri = "s3://{}/{}".format(bucket, baseline_results_prefix)

# Key prefixes used by later cells when uploading the model archives.
baseline_model_prefix = prefix + "data-quality/model_data"
baseline_process_model_prefix = prefix + "data-quality/preprocess_model_data"

# Upload the baseline CSV; the call's return value is left as the cell output.
sagemaker_session.upload_data(
    key_prefix=baseline_data_prefix,
    bucket=bucket,
    path="data/train_data_no_target.csv",
)


## Create a Data Quality Baseline

In [None]:
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

# Monitor object used to run the data-quality baselining job.
my_default_monitor = DefaultModelMonitor(
    sagemaker_session=sagemaker_session,
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

# Run the baselining job against the uploaded CSV (read with a header row)
# and block until it completes (wait=True).
my_default_monitor.suggest_baseline(
    output_s3_uri=baseline_results_uri,
    dataset_format=DatasetFormat.csv(header=True),
    baseline_dataset=baseline_data_uri+'/train_data_no_target.csv',
    wait=True,
)

In [None]:
import pandas as pd

# Take the statistics produced by the data-quality baselining job and flatten
# the per-feature records under "features" into a table.
# pd.json_normalize is the public entry point; the pd.io.json.json_normalize
# alias has been deprecated since pandas 1.0 and is not available in newer
# pandas releases.
baseline_job = my_default_monitor.latest_baselining_job
schema_df = pd.json_normalize(baseline_job.baseline_statistics().body_dict["features"])
schema_df.head(10)

In [None]:
# Flatten the per-feature suggested constraints of the baselining job into a table.
# pd.json_normalize replaces the pd.io.json.json_normalize alias, which has been
# deprecated since pandas 1.0 and is not available in newer pandas releases.
constraints_df = pd.json_normalize(
    baseline_job.suggested_constraints().body_dict["features"]
)
constraints_df.head(10)

In [None]:
# Display the destination of the baselining job's first output; the next code
# cell appends "constraints.json" / "statistics.json" to this same destination.
baseline_job.outputs[0].destination


Save baselines in the config file for later use

In [None]:
from utils import save_baseline

# S3 URIs always use "/" as the separator, so build them with string formatting
# rather than os.path.join, which would insert "\" when run locally on Windows.
data_quality_output_uri = baseline_job.outputs[0].destination.rstrip("/")

# Record where the data-quality constraints, statistics and baseline dataset live.
save_baseline('data-quality-constraints', "{}/constraints.json".format(data_quality_output_uri))
save_baseline('data-quality-statistics', "{}/statistics.json".format(data_quality_output_uri))
save_baseline('data-quality-baseline-data', baseline_data_uri+'/train_data_no_target.csv')


## Model Quality Baseline

In [None]:
from sagemaker.model_monitor import ModelQualityMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat


# Monitor object used to run the model-quality baselining job.
model_quality_monitor = ModelQualityMonitor(
    sagemaker_session=sagemaker_session,
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    volume_size_in_gb=20,
    max_runtime_in_seconds=1800,
)

In [None]:
# Model-quality baseline input: key prefix and the S3 URI built from it.
baseline_data_prefix = prefix + "model-quality/baseline_input_data"
baseline_data_uri = "s3://{}/{}".format(bucket, baseline_data_prefix)

# Model-quality baseline results: key prefix and the S3 URI built from it.
baseline_results_prefix = prefix + "model-quality/baseline_results"
baseline_results_uri = "s3://{}/{}".format(bucket, baseline_results_prefix)

# Upload the CSV that holds predictions alongside the ground truth.
sagemaker_session.upload_data(
    key_prefix=baseline_data_prefix,
    bucket=bucket,
    path="data/train_data_with_prediction.csv",
)

# Start the baselining job; the attribute names below are column names in the CSV.
job = model_quality_monitor.suggest_baseline(
    problem_type='BinaryClassification',
    baseline_dataset=baseline_data_uri,  # S3 prefix the dataset was uploaded under
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_results_uri,  # S3 location that receives the results
    ground_truth_attribute="credit_risk",  # column with the ground truth labels
    inference_attribute="prediction",  # column with the predictions
    probability_attribute="prediction_probability",  # column with the probabilities
)
job.wait(logs=False)

In [None]:
# Rebind baseline_job (until now the data-quality job taken from my_default_monitor)
# to the model-quality monitor's latest baselining job.
baseline_job = model_quality_monitor.latest_baselining_job

In [None]:
# S3 URIs always use "/" as the separator, so build them with string formatting
# rather than os.path.join, which would insert "\" when run locally on Windows.
model_quality_output_uri = baseline_job.outputs[0].destination.rstrip("/")

save_baseline('model-quality-constraints', "{}/constraints.json".format(model_quality_output_uri))
save_baseline('model-quality-statistics', "{}/statistics.json".format(model_quality_output_uri))
# baseline_data_uri carries no trailing slash, so the "/" before the file name
# must be added explicitly (without it the saved URI points at a non-existent key).
save_baseline('model-quality-baseline-data', baseline_data_uri+'/train_data_with_prediction.csv')

In [None]:
import pandas as pd

# Show the suggested binary-classification constraints as a transposed table.
binary_constraints = baseline_job.suggested_constraints().body_dict["binary_classification_constraints"]
pd.DataFrame(binary_constraints).transpose()

## Model Explainability Baseline

Create the preprocessing model


In [None]:

from sagemaker.model import Model


# Settings for the preprocessing model: content type, model name and container image.
dataset_type = "text/csv"
model_name = "sm-preprocess-model-for-explainability"
image_uri = sagemaker.image_uris.retrieve(framework="sklearn", region=region, version="0.23-1")

# Upload the preprocessing archive; the value returned by upload_data is passed
# to the Model below as model_data.
model_url = sagemaker_session.upload_data(
    key_prefix=baseline_process_model_prefix,
    bucket=bucket,
    path="data/processed/model.tar.gz",
)

preprocessing_model = Model(
    name=model_name,
    image_uri=image_uri,
    model_data=model_url,
    entry_point="inference_preprocessing.py",
    role=role,
    sagemaker_session=sagemaker_session,
)


Create the XGBoost model

In [None]:
!cd data/trained/ && tar czvf model.tar.gz model.bin

from sagemaker.model import Model


image_uri = sagemaker.image_uris.retrieve("xgboost", region, "1.3-1")
model_name = "sm-model-for-explainability"
dataset_type = "text/csv"

model_url = sagemaker_session.upload_data(
    path="data/trained/model.tar.gz", bucket=bucket, key_prefix=baseline_model_prefix
)

xgboost_model = Model(
    role=role,
    name=model_name,
    image_uri=image_uri,
    model_data=model_url,
    entry_point="inference.py",
    sagemaker_session=sagemaker_session,
)

Create the pipeline model

In [None]:
from sagemaker.pipeline import PipelineModel


# Name of the combined model; later cells pass model_name on to ModelConfig.
model_name = "e2e-model"

# Chain the preprocessing model and the XGBoost model, in that order.
pipeline_model = PipelineModel(
    models=[preprocessing_model, xgboost_model],
    name=model_name,
    role=role,
    sagemaker_session=sagemaker_session,
)

pipeline_model.create(instance_type="ml.m5.large")

In [None]:
from sagemaker.model_monitor import ModelExplainabilityMonitor
from sagemaker.clarify import DataConfig, SHAPConfig, ModelConfig
import pandas as pd
# from sagemaker.xgboost.model import XGBoostModel

# Explainability baseline input: key prefix and the S3 URI built from it.
baseline_data_prefix = prefix + "model-explainability/baseline_input_data"
baseline_data_uri = "s3://{}/{}".format(bucket, baseline_data_prefix)

# Explainability baseline results: key prefix and the S3 URI built from it.
baseline_results_prefix = prefix + "model-explainability/baseline_results"
baseline_results_uri = "s3://{}/{}".format(bucket, baseline_results_prefix)

# Upload the training CSV (label column included) as the job's input dataset.
input_data_uri = sagemaker_session.upload_data(
    key_prefix=baseline_data_prefix,
    bucket=bucket,
    path="data/train.csv",
)

# The headers keep the label column; the SHAP baseline row drops it.
label_header = "credit_risk"
test_dataframe = pd.read_csv("data/train.csv")
all_headers = test_dataframe.columns.tolist()

# SHAP baseline: a single row holding the most frequent value (mode) of every
# feature, converted to plain Python ints.
feature_modes = test_dataframe.drop(label_header, axis=1).mode().iloc[0]
shap_baseline = [feature_modes.values.astype(int).tolist()]

# model_name and dataset_type come from the model-creation cells above.
model_config = ModelConfig(
    model_name=model_name,
    instance_type="ml.m5.large",
    instance_count=1,
    content_type=dataset_type,
    accept_type=dataset_type,
)

model_explainability_monitor = ModelExplainabilityMonitor(
    sagemaker_session=sagemaker_session,
    role=role,
    max_runtime_in_seconds=1800,
)

model_explainability_data_config = DataConfig(
    s3_data_input_path=input_data_uri,
    s3_output_path=baseline_results_uri,
    dataset_type=dataset_type,
    headers=all_headers,
    label=label_header,
)

shap_config = SHAPConfig(
    baseline=shap_baseline,
    agg_method="mean_abs",
    num_samples=100,
    save_local_shap_values=True,
)


In [None]:
# Start the explainability baselining job and block until it finishes (wait=True).
model_explainability_monitor.suggest_baseline(
    explainability_config=shap_config,
    model_config=model_config,
    data_config=model_explainability_data_config,
    wait=True,
)
print(f"ModelExplainabilityMonitor baselining job: {model_explainability_monitor.latest_baselining_job_name}")


In [None]:
# Wait on the latest baselining job (without streaming logs), then fetch the
# constraints it suggested.
model_explainability_monitor.latest_baselining_job.wait(logs=False)
model_explainability_constraints = model_explainability_monitor.suggested_constraints()

print()
print(f"ModelExplainabilityMonitor suggested constraints: {model_explainability_constraints.file_s3_uri}")

# Read the constraints file from S3 and print its content.
explainability_constraints_text = sagemaker.s3.S3Downloader.read_file(
    model_explainability_constraints.file_s3_uri,
    sagemaker_session,
)
print(explainability_constraints_text)


In [None]:
from utils import save_baseline

baseline_job = model_explainability_monitor.latest_baselining_job

# S3 URIs always use "/" as the separator, so build them with string formatting
# rather than os.path.join, which would insert "\" when run locally on Windows.
explainability_output_uri = baseline_job.outputs[0].destination.rstrip("/")

# Record where the explainability analysis, its config and the baseline dataset live.
save_baseline('model-explainability-analysis', "{}/analysis.json".format(explainability_output_uri))
save_baseline('model-explainability-analysis_config', "{}/analysis_config.json".format(explainability_output_uri))
save_baseline('model-explainability-baseline-data', input_data_uri)

## Model Bias Baseline

In [None]:
# This cell repeats the preprocessing-model cell from the explainability
# section above (it is a common step); only the model name differs.

from sagemaker.model import Model

# Settings for the preprocessing model: content type, model name and container image.
dataset_type = "text/csv"
model_name = "sm-preprocess-model-for-bias"
image_uri = sagemaker.image_uris.retrieve(framework="sklearn", region=region, version="0.23-1")

# Upload the preprocessing archive; the value returned by upload_data is passed
# to the Model below as model_data.
model_url = sagemaker_session.upload_data(
    key_prefix=baseline_process_model_prefix,
    bucket=bucket,
    path="data/processed/model.tar.gz",
)

preprocessing_model = Model(
    name=model_name,
    image_uri=image_uri,
    model_data=model_url,
    entry_point="inference_preprocessing.py",
    role=role,
    sagemaker_session=sagemaker_session,
)

In [None]:
# Followig cell is identical with cell in Explainability baseline section above as its a common step
# only changing model name 

!cd data/trained/ && tar czvf model.tar.gz model.bin

from sagemaker.model import Model


image_uri = sagemaker.image_uris.retrieve("xgboost", region, "1.3-1")
model_name = "sm-model-for-bias"
dataset_type = "text/csv"

model_url = sagemaker_session.upload_data(
    path="data/trained/model.tar.gz", bucket=bucket, key_prefix=baseline_model_prefix
)

xgboost_model = Model(
    role=role,
    name=model_name,
    image_uri=image_uri,
    model_data=model_url,
    entry_point="inference.py",
    sagemaker_session=sagemaker_session,
)

In [None]:
from sagemaker.pipeline import PipelineModel


# Name of the combined model; the bias ModelConfig below is built from model_name.
model_name = "e2e-model-bias"

# Chain the preprocessing model and the XGBoost model, in that order.
pipeline_model = PipelineModel(
    models=[preprocessing_model, xgboost_model],
    name=model_name,
    role=role,
    sagemaker_session=sagemaker_session,
)

pipeline_model.create(instance_type="ml.m5.large")

In [None]:
from sagemaker.model_monitor import ModelBiasMonitor
from sagemaker.clarify import DataConfig, BiasConfig, ModelConfig
import pandas as pd
# from sagemaker.xgboost.model import XGBoostModel

# Bias baseline input: key prefix and the S3 URI built from it.
baseline_data_prefix = prefix + "model-bias/baseline_input_data"
baseline_data_uri = "s3://{}/{}".format(bucket, baseline_data_prefix)

# Bias baseline results: key prefix and the S3 URI built from it.
baseline_results_prefix = prefix + "model-bias/baseline_results"
baseline_results_uri = "s3://{}/{}".format(bucket, baseline_results_prefix)

# Upload the training CSV (label column included) as the job's input dataset.
input_data_uri = sagemaker_session.upload_data(
    key_prefix=baseline_data_prefix,
    bucket=bucket,
    path="data/train.csv",
)

# Column names are read back from the same CSV; the label stays in the header list.
label_header = "credit_risk"
test_dataframe = pd.read_csv("data/train.csv")
all_headers = test_dataframe.columns.tolist()

# model_name and dataset_type come from the model-creation cells above.
model_config = ModelConfig(
    model_name=model_name,
    instance_type="ml.m5.large",
    instance_count=1,
    content_type=dataset_type,
    accept_type=dataset_type,
)

model_bias_monitor = ModelBiasMonitor(
    sagemaker_session=sagemaker_session,
    role=role,
    max_runtime_in_seconds=1800,
)

model_bias_data_config = DataConfig(
    s3_data_input_path=input_data_uri,
    s3_output_path=baseline_results_uri,
    dataset_type=dataset_type,
    headers=all_headers,
    label=label_header,
)

# BiasConfig describes the sensitive groups in the dataset. Bias is typically
# measured by computing a metric and comparing it across groups; the group of
# interest is selected through the "facet" (here the foreign_worker column).
bias_config = BiasConfig(
    facet_name="foreign_worker",
    facet_values_or_threshold=[1],
    label_values_or_threshold=[1],
)

In [None]:
# Start the bias baselining job and block until it finishes (wait=True).
model_bias_monitor.suggest_baseline(
    bias_config=bias_config,
    model_config=model_config,
    data_config=model_bias_data_config,
    wait=True,
)
print(f"ModelBiasMonitor baselining job: {model_bias_monitor.latest_baselining_job_name}")

In [None]:
# Wait on the latest baselining job (without streaming logs), then fetch the
# constraints it suggested.
model_bias_monitor.latest_baselining_job.wait(logs=False)
model_bias_constraints = model_bias_monitor.suggested_constraints()

print()
print(f"ModelBiasMonitor suggested constraints: {model_bias_constraints.file_s3_uri}")

# Read the constraints file from S3 and print its content.
bias_constraints_text = sagemaker.s3.S3Downloader.read_file(
    model_bias_constraints.file_s3_uri,
    sagemaker_session,
)
print(bias_constraints_text)

In [None]:
from utils import save_baseline

baseline_job = model_bias_monitor.latest_baselining_job

# S3 URIs always use "/" as the separator, so build them with string formatting
# rather than os.path.join, which would insert "\" when run locally on Windows.
bias_output_uri = baseline_job.outputs[0].destination.rstrip("/")

# Record where the bias analysis, its config and the baseline dataset live.
save_baseline('model-bias-analysis', "{}/analysis.json".format(bias_output_uri))
save_baseline('model-bias-analysis_config', "{}/analysis_config.json".format(bias_output_uri))
save_baseline('model-bias-baseline-data', input_data_uri)

In [None]:
# DO NOT DELETE the model: the model object is reused when running the ME check, and the model object itself incurs no costs.
# pipeline_model.delete_model()

In [None]:
# pipeline_model.deploy(initial_instance_count=1, instance_type="ml.m5.large", endpoint_name="manual-endpoint9")
# s = "1,18,4,2,1049,1,2,4,2,1,4,2,21,3,1,1,3,2,1,2"    
# predictor = sagemaker.Predictor("manual-endpoint9", sagemaker_session=sagemaker_session, serializer=sagemaker.serializers.CSVSerializer(),deserializer=sagemaker.deserializers.CSVDeserializer(), )
# predictor.predict(s)
# predictor.delete_endpoint()
