### Main Steps:

1. Train and host a SageMaker model - SageMaker
2. Import the SageMaker endpoint into Amazon Fraud Detector (AFD) and set up the detector - AFD
3. Test the detector - GetEventPrediction (GEP) / Batch Prediction - AFD


In [1]:
import sagemaker

# SageMaker session; its default bucket receives the training data uploads and
# the estimator's model output further down.
sess = sagemaker.Session()
bucket = sess.default_bucket()
# Key prefix under which every S3 object of this demo is stored.
s3_prefix = "sagemaker/demo-afd-sagemaker-endpoint"
# Suffix appended to the endpoint name and to the Fraud Detector resource names
# (entity type, event type, score variable, detector) created below.
version_prefix = 'v1'

# Define IAM role
import boto3
import re
from sagemaker import get_execution_role

# Execution role of this notebook; passed to the Estimator for training and to
# put_external_model as invokeModelEndpointRoleArn.
role = get_execution_role()

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

### Step 1: Train and Host a SageMaker model

Code Reference: https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_applying_machine_learning/xgboost_customer_churn/xgboost_customer_churn.ipynb

In [3]:
# Load the sample events from a CSV in the working directory and show the
# class balance of the EVENT_LABEL column.
data = pd.read_csv("fraud_data_20K_sample.csv")
data['EVENT_LABEL'].value_counts()

legit    18996
fraud     1004
Name: EVENT_LABEL, dtype: int64

In [4]:
# Preview the first rows to see the available columns.
data.head()

Unnamed: 0,EVENT_LABEL,EVENT_TIMESTAMP,ip_address,email_address,order_amt,prev_amt
0,legit,10/8/2019 20:44,46.41.252.160,fake_acostasusan@example.org,153.71,58.3
1,legit,5/23/2020 19:44,152.58.247.12,fake_christopheryoung@example.com,2.57,11.63
2,legit,4/24/2020 18:26,12.252.206.222,fake_jeffrey09@example.org,30.96,52.41
3,legit,4/22/2020 19:07,170.81.164.240,fake_ncastro@example.org,63.87,34.21
4,legit,12/31/2019 17:08,165.182.68.217,fake_charles99@example.org,70.36,66.58


In [5]:
# prepare data for sagemaker model training
# Layout: binary label first (1 = fraud, 0 = anything else), then the two numeric
# features. The label is built explicitly instead of via pd.get_dummies because
# get_dummies returns bool columns on pandas >= 2.0 (written to CSV as
# True/False), and dropping only the "_fraud"/"_legit" dummies would leak any
# additional label value into the feature columns.
model_data = pd.concat(
    [
        (data["EVENT_LABEL"] == "fraud").astype(int).rename("EVENT_LABEL_fraud"),
        data[["order_amt", "prev_amt"]],
    ],
    axis=1,
)

In [6]:
# Shuffle once with a fixed seed, then cut into 70% train / 20% validation /
# 10% test. Positional slicing replaces np.split, which on a DataFrame goes
# through the deprecated DataFrame.swapaxes; the resulting rows are the same.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

# The training files are written without header or index; the label is the first column.
train_data.to_csv("train.csv", header=False, index=False)
validation_data.to_csv("validation.csv", header=False, index=False)

In [7]:
# upload to s3
# S3 keys always use "/" as separator, so they are built with plain string
# formatting rather than os.path.join (whose separator depends on the OS).
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
for split_name in ("train", "validation"):
    s3_bucket.Object(f"{s3_prefix}/{split_name}/{split_name}.csv").upload_file(
        f"{split_name}.csv"
    )

In [8]:
# Look up the XGBoost algorithm container image for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.5-1",
)
display(container)

'246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.5-1'

In [9]:
# Training / validation channels pointing at the CSV files uploaded above.
s3_input_train = TrainingInput(
    s3_data=f"s3://{bucket}/{s3_prefix}/train",
    content_type="csv",
)
s3_input_validation = TrainingInput(
    s3_data=f"s3://{bucket}/{s3_prefix}/validation/",
    content_type="csv",
)

In [10]:
sess = sagemaker.Session()

# Estimator for the XGBoost container; model artifacts are written under
# <s3_prefix>/output in the session's default bucket.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/{s3_prefix}/output",
    sagemaker_session=sess,
)

# Binary classifier producing a probability-like score in [0, 1].
xgb.set_hyperparameters(
    objective="binary:logistic",
    num_round=100,
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
)

# Launch the training job on the two channels defined above.
xgb.fit({"train": s3_input_train, "validation": s3_input_validation})

2023-03-23 16:27:01 Starting - Starting the training job...ProfilerReport-1679588820: InProgress
...
2023-03-23 16:28:00 Starting - Preparing the instances for training.........
2023-03-23 16:29:20 Downloading - Downloading input data...
2023-03-23 16:30:00 Training - Downloading the training image...
2023-03-23 16:30:26 Training - Training image download completed. Training in progress...[34m[2023-03-23 16:30:36.896 ip-10-0-252-124.us-west-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-03-23 16:30:36.977 ip-10-0-252-124.us-west-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-03-23:16:30:37:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-03-23:16:30:37:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2023-03-23:16:30:37:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-03

In [11]:
# Host the trained model behind a real-time endpoint. The endpoint name carries
# the version suffix and is the name later registered with Fraud Detector.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=CSVSerializer(),
    endpoint_name=f"sagemaker-xgb-endpoint-{version_prefix}",
)

-------!

In [12]:
def predict(data, rows=500):
    """Score ``data`` against the deployed endpoint in chunks.

    Arguments:
        data - 2-D array of feature rows (no label column); needs ``.shape``
        rows - upper bound on the number of rows sent per request

    Returns:
        list of prediction strings, one per input row. The endpoint response is
        assumed to be newline-terminated text with one score per line.
    """
    # Nothing to score: avoid sending an empty request to the endpoint.
    if data.shape[0] == 0:
        return []

    # One more chunk than full multiples of `rows`, so no chunk exceeds `rows`.
    n_chunks = int(data.shape[0] / float(rows) + 1)
    responses = [
        xgb_predictor.predict(chunk).decode("utf-8")
        for chunk in np.array_split(data, n_chunks)
    ]

    # Every response ends with "\n"; drop the empty trailing element of the split.
    return "".join(responses).split("\n")[:-1]


# Column 0 of test_data is the label (EVENT_LABEL_fraud); send only the features.
predictions = predict(test_data.to_numpy()[:, 1:])

In [13]:
# Convert the textual scores returned by the endpoint into a float array.
predictions = np.array(list(map(float, predictions)))
print(len(predictions), predictions)

2000 [0.0716714  0.03765393 0.02415792 ... 0.05634578 0.06239426 0.03940216]


### Step 2: Import the SageMaker model to AFD and set up the detector

In [14]:
# Amazon Fraud Detector client used by every cell from here on.
fraudDetector = boto3.client('frauddetector')



In [15]:
### create afd variables, entity and event type
# Create each event variable only if it does not exist yet. Only the
# "not found" error triggers creation; any other failure (credentials,
# throttling, validation) is surfaced instead of being swallowed.
for variable_name in ('order_amt', 'prev_amt'):
    try:
        resp = fraudDetector.get_variables(name = variable_name)
    except fraudDetector.exceptions.ResourceNotFoundException:
        resp = fraudDetector.create_variable(name = variable_name, dataType = 'FLOAT', dataSource ='EVENT', defaultValue = '0.0')

response = fraudDetector.put_entity_type(name = f'sagemaker-xgb-entity-{version_prefix}')

# The event type ties the two variables to the entity type created above.
response = fraudDetector.put_event_type (
        name           = f'sagemaker-xgb-transaction-{version_prefix}',
        eventVariables = ['order_amt', 'prev_amt'],
        entityTypes    = [f'sagemaker-xgb-entity-{version_prefix}'])

In [16]:
### create external model score variable
# Guarded like the event variables so that re-running the cell does not fail
# when the score variable already exists.
score_variable_name = f'sagemaker_xgb_score_{version_prefix}'
try:
    resp = fraudDetector.get_variables(name = score_variable_name)
except fraudDetector.exceptions.ResourceNotFoundException:
    resp = fraudDetector.create_variable(name = score_variable_name, dataType = 'FLOAT', dataSource ='EXTERNAL_MODEL_SCORE', defaultValue = '0.0')


In [17]:
### put external model
# https://docs.aws.amazon.com/frauddetector/latest/ug/import-an-amazon-sagemaker-model.html
# Registers the SageMaker endpoint deployed above as an external model of the
# event type, describing how to build the request and how to read the response.
fraudDetector.put_external_model(
    modelSource = 'SAGEMAKER',
    # Same name as the endpoint_name passed to xgb.deploy(...).
    modelEndpoint = f'sagemaker-xgb-endpoint-{version_prefix}',
    # NOTE(review): reuses the notebook's execution role; confirm Fraud Detector
    # is allowed to assume it to invoke the endpoint.
    invokeModelEndpointRoleArn = role, #'your_SagemakerExecutionRole_arn',
    inputConfiguration = {
        'useEventVariables' : True,
        'eventTypeName' : f'sagemaker-xgb-transaction-{version_prefix}',
        'format' : 'TEXT_CSV',
        # Feature order matches the training CSV columns after the label:
        # order_amt, prev_amt.
        'csvInputTemplate' : '{{order_amt}}, {{prev_amt}}' # add afd enrichment, how the config works
    },
    outputConfiguration = {
        'format' : 'TEXT_CSV',
        # Field index '0' of the endpoint's CSV response is stored in the
        # external-model score variable.
        'csvIndexToVariableMap' : {
        '0' : f'sagemaker_xgb_score_{version_prefix}'
        }
    },
    modelEndpointStatus = 'ASSOCIATED'
)

{'ResponseMetadata': {'RequestId': '45e3a103-ab54-4ce9-9656-8f34665d5bba',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 16:35:21 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': '45e3a103-ab54-4ce9-9656-8f34665d5bba'},
  'RetryAttempts': 0}}

In [18]:
### create a detector
# The detector is bound to the event type created earlier; its id is reused by
# the rule, detector-version and prediction cells.
DETECTOR_NAME = f"afd-with-sagemaker-model-{version_prefix}"
response = fraudDetector.put_detector(
    detectorId=DETECTOR_NAME,
    eventTypeName=f'sagemaker-xgb-transaction-{version_prefix}',
)

In [19]:
### Create rules

def create_outcomes(outcomes):
    """
    Register every name in ``outcomes`` as a Fraud Detector outcome.

    Each outcome's description is set to its own name. Returns nothing.
    """
    for outcome_name in outcomes:
        print("creating outcome variable: {0} ".format(outcome_name))
        fraudDetector.put_outcome(name=outcome_name, description=outcome_name)

def create_rules(score_cuts, outcomes, MODEL_SCORE_NAME, DETECTOR_NAME):
    """
    Create one threshold rule per outcome on a model score variable.

    For every outcome except the last, the rule is
    ``$<score> > score_cuts[i]``; the last outcome gets the catch-all
    ``$<score> <= <last score cut>``.

    Arguments:
        score_cuts        - list of score cuts to create rules
                            (one fewer than outcomes, at least one)
        outcomes          - list of outcomes associated with the rules,
                            in the same order as the score cuts
        MODEL_SCORE_NAME  - name of the score variable used in the expressions
        DETECTOR_NAME     - id of the detector the rules are created in

    Returns:
        a rule list to use when creating the detector version; each entry has
        ruleId, ruleVersion (always '1') and detectorId

    Raises:
        ValueError - if score_cuts is empty or its length is not
                     len(outcomes) - 1
    """
    # Validate up front: a mismatch would otherwise end in an IndexError or in
    # a rule set that silently ignores some outcomes / score cuts.
    if not score_cuts or len(score_cuts) + 1 != len(outcomes):
        raise ValueError(
            'Your score cuts and outcomes are not matched: expected '
            'len(outcomes) == len(score_cuts) + 1 with at least one score cut, '
            'got {0} score cuts and {1} outcomes.'.format(len(score_cuts), len(outcomes))
        )

    rule_list = []
    last_index = len(outcomes) - 1
    for i, outcome in enumerate(outcomes):
        # rule expression
        if i < last_index:
            rule = "${0} > {1}".format(MODEL_SCORE_NAME, score_cuts[i])
        else:
            # catch-all: everything at or below the last score cut
            rule = "${0} <= {1}".format(MODEL_SCORE_NAME, score_cuts[i - 1])

        # append to rule_list (used when create detector)
        rule_id = "rules_{0}_{1}".format(i, MODEL_SCORE_NAME)

        rule_list.append({
            "ruleId": rule_id,
            "ruleVersion" : '1',
            "detectorId"  : DETECTOR_NAME
        })

        # create rules
        print("creating rule: {0}: IF {1} THEN {2}".format(rule_id, rule, outcome))
        try:
            fraudDetector.create_rule(
                ruleId = rule_id,
                detectorId = DETECTOR_NAME,
                expression = rule,
                language = 'DETECTORPL',
                outcomes = [outcome]
                )
        except Exception as err:
            # Best effort, so the cell can be re-run: keep going, but report the
            # real error instead of assuming the rule already exists.
            print("could not create rule {0} (it may already exist in this detector): {1}".format(rule_id, err))

    return rule_list

# Score cuts in descending order, one fewer than the outcomes:
# score > 0.9 -> fraud, score > 0.5 -> investigate, score <= 0.5 -> approve.
# The detector version below uses FIRST_MATCHED, so the rule order matters.
score_cuts = [0.9, 0.5]                         
outcomes = ['fraud', 'investigate', 'approve']  
create_outcomes(outcomes)
rule_list = create_rules(score_cuts, outcomes, f'sagemaker_xgb_score_{version_prefix}', DETECTOR_NAME)

creating outcome variable: fraud 
creating outcome variable: investigate 
creating outcome variable: approve 
creating rule: rules_0_sagemaker_xgb_score_v1: IF $sagemaker_xgb_score_v1 > 0.9 THEN fraud
creating rule: rules_1_sagemaker_xgb_score_v1: IF $sagemaker_xgb_score_v1 > 0.5 THEN investigate
creating rule: rules_2_sagemaker_xgb_score_v1: IF $sagemaker_xgb_score_v1 <= 0.5 THEN approve


In [20]:
# -- create detector version --
# Combines the rules created above with the external SageMaker endpoint; the
# first rule that matches decides the outcome.
response = fraudDetector.create_detector_version(
    detectorId=DETECTOR_NAME,
    rules=rule_list,
    externalModelEndpoints=[f'sagemaker-xgb-endpoint-{version_prefix}'],
    ruleExecutionMode='FIRST_MATCHED',
)

In [21]:
# Activate the most recently created detector version rather than assuming it
# is version '1': when the notebook is re-run against an existing detector,
# a hard-coded '1' would keep activating a stale version.
detector_versions = fraudDetector.describe_detector(detectorId = DETECTOR_NAME)['detectorVersionSummaries']
# Version ids are numeric strings, so compare them as integers.
latest_version_id = max((v['detectorVersionId'] for v in detector_versions), key = int)

response = fraudDetector.update_detector_version_status(
    detectorId        = DETECTOR_NAME,
    detectorVersionId = latest_version_id,
    status            = 'ACTIVE'
)

In [22]:
# Preview held-out rows; the values of one row are used for the test event below.
test_data.head()

Unnamed: 0,EVENT_LABEL_fraud,order_amt,prev_amt
1459,0,156.0,135.04
13935,0,41.58,89.56
6843,0,21.4,404.08
17103,0,35.17,135.47
2286,0,91.72,122.84


### Step 3: Test the detector using the boto3 SDK

In [23]:
# Score one held-out row through the detector. DETECTOR_NAME is reused instead
# of rebuilding the id, and detectorVersionId is omitted so the detector's
# ACTIVE version is evaluated rather than a hard-coded '1'.
pred = fraudDetector.get_event_prediction(
    detectorId        = DETECTOR_NAME,
    eventId           = '1459',
    eventTypeName     = f'sagemaker-xgb-transaction-{version_prefix}',
    eventTimestamp    = '2019-10-05T22:50:48Z',
    entities          = [{
        'entityType': f'sagemaker-xgb-entity-{version_prefix}', 
        'entityId':"UNKNOWN"
    }],
    # Event variable values are passed as strings.
    eventVariables    = {
        'order_amt': '156',
        'prev_amt':'135.04'
    }) 

In [24]:
# Show the full response: matched rule/outcome and the external model's score.
pred

{'modelScores': [],
 'ruleResults': [{'ruleId': 'rules_2_sagemaker_xgb_score_v1',
   'outcomes': ['approve']}],
 'externalModelOutputs': [{'externalModel': {'modelEndpoint': 'sagemaker-xgb-endpoint-v1',
    'modelSource': 'SAGEMAKER'},
   'outputs': {'sagemaker_xgb_score_v1': '0.07167139649391174\n'}}],
 'ResponseMetadata': {'RequestId': '34c934b6-0030-4159-a7ff-06b7b3e5fa4d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 16:35:28 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '277',
   'connection': 'keep-alive',
   'x-amzn-requestid': '34c934b6-0030-4159-a7ff-06b7b3e5fa4d'},
  'RetryAttempts': 0}}