## Step 1: Import Packages and Declare Constants

In [1]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd

In [2]:
# Name of the S3 bucket that receives the data/script uploads and is passed to the pipeline.
# Replace this value with the name of the S3 bucket you created.
default_bucket = "customer-churn-sm-pipeline"

In [3]:
# Region, execution role and SageMaker session resolved from the current environment.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# Version string and resource names handed to the pipeline builder.
sklearn_processor_version = "0.23-1"
model_package_group_name = "ChurnModelPackageGroup"
pipeline_name = "ChurnModelSMPipeline"

# Container image URI looked up for the sklearn framework at the version above.
clarify_image = sagemaker.image_uris.retrieve(
    framework="sklearn",
    version=sklearn_processor_version,
    region=region,
)

## Step 2: Generate Baseline Dataset

The baseline data is used by the SageMaker Clarify step to generate SHAP values.

In [4]:
def preprocess_data(file_path):
    """Load the store data CSV and convert it into a purely numeric feature table.

    Steps performed:
      * parse ``firstorder`` / ``lastorder`` as datetimes (unparseable values become NaT),
      * drop every row that contains a null value (including the NaT rows above),
      * derive ``first_last_days_diff`` and ``created_first_days_diff`` (whole days),
      * drop ``custid`` and the raw date columns,
      * one-hot encode ``favday`` and ``city``.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The processed frame. One-hot columns are always ``uint8`` so that a
        later ``to_csv`` writes ``0``/``1`` regardless of the pandas version
        (pandas >= 2.0 would otherwise emit ``True``/``False``).
    """
    df = pd.read_csv(file_path)
    ## Convert to datetime columns
    df["firstorder"] = pd.to_datetime(df["firstorder"], errors='coerce')
    df["lastorder"] = pd.to_datetime(df["lastorder"], errors='coerce')
    ## Drop Rows with null values
    df = df.dropna()
    ## Create Column which gives the days between the last order and the first order
    df["first_last_days_diff"] = (df['lastorder'] - df['firstorder']).dt.days
    ## Create Column which gives the days between when the customer record was created and the first order
    df['created'] = pd.to_datetime(df['created'])
    df['created_first_days_diff'] = (df['created'] - df['firstorder']).dt.days
    ## Drop Columns (returns a new frame instead of mutating in place)
    df = df.drop(columns=['custid', 'created', 'firstorder', 'lastorder'])
    ## Apply one hot encoding on favday and city columns.
    ## dtype is pinned: the default changed from uint8 to bool in pandas 2.0.
    df = pd.get_dummies(
        df,
        prefix=['favday', 'city'],
        columns=['favday', 'city'],
        dtype='uint8',
    )
    return df

In [5]:
baseline_data = preprocess_data("data/storedata_total.csv")
# The baseline file carries no "retained" column, so remove it here.
baseline_data = baseline_data.drop(columns="retained")
# Fixed seed: an unseeded sample() yields different rows on every Restart & Run All.
baseline_sample = baseline_data.sample(frac=0.0002, random_state=42)

In [6]:
# Write the sampled baseline rows without a header row or index column.
baseline_sample.to_csv("data/baseline.csv", index=False, header=False)

## Step 3: Generate Batch Dataset

In [7]:
batch_data = preprocess_data("data/storedata_total.csv")
# The batch file carries no "retained" column, so remove it here.
batch_data = batch_data.drop(columns="retained")
# Fixed seed: an unseeded sample() yields different rows on every Restart & Run All.
batch_sample = batch_data.sample(frac=0.2, random_state=42)

In [8]:
# Write the sampled batch rows without a header row or index column.
batch_sample.to_csv("data/batch.csv", index=False, header=False)

## Step 4: Copy Data and Scripts to S3 Bucket

In [9]:
s3_client = boto3.resource('s3')

# (local file, destination key in the bucket) pairs, uploaded in this order.
for local_path, s3_key in (
    ("data/storedata_total.csv", "data/storedata_total.csv"),
    ("data/batch.csv", "data/batch/batch.csv"),
    ("data/baseline.csv", "input/baseline/baseline.csv"),
):
    s3_client.Bucket(default_bucket).upload_file(local_path, s3_key)

In [10]:
# Each script is copied from pipelines/customerchurn/ to input/code/ under the same file name.
for script_name in ("preprocess.py", "evaluate.py", "generate_config.py"):
    s3_client.Bucket(default_bucket).upload_file(
        "pipelines/customerchurn/" + script_name,
        "input/code/" + script_name,
    )

## Step 5: Get the Pipeline Instance

In [11]:
from pipelines.customerchurn.pipeline import get_pipeline

# Build the pipeline object from the constants declared in Step 1.
pipeline = get_pipeline(
    # Environment
    region=region,
    role=role,
    default_bucket=default_bucket,
    # Resource names
    pipeline_name=pipeline_name,
    model_package_group_name=model_package_group_name,
    # Container image and processor version
    custom_image_uri=clarify_image,
    sklearn_processor_version=sklearn_processor_version,
)

The class JsonGet has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [12]:
# Render the pipeline definition; the cell output is its JSON string.
pipeline.definition()

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


'{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "ProcessingInstanceCount", "Type": "Integer", "DefaultValue": 1}, {"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "InputData", "Type": "String", "DefaultValue": "s3://customer-churn-sm-pipeline/data/storedata_total.csv"}, {"Name": "BatchData", "Type": "String", "DefaultValue": "s3://customer-churn-sm-pipeline/data/batch/batch.csv"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnModelProcess", "Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": {"InstanceType": {"Get": "Parameters.ProcessingInstanceType"}, "InstanceCount": {"Get": "Parameters.ProcessingInstanceCount"}, "VolumeSizeInGB": 30}}, "AppSpecification": {"ImageUri": "683313688378.dkr.ecr.us-ea

## Step 6: Submit the pipeline to SageMaker and start execution

In [13]:
# Submit the pipeline to SageMaker under the execution role; the response includes the pipeline ARN.
pipeline.upsert(role_arn=role)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


{'PipelineArn': 'arn:aws:sagemaker:us-east-1:XXXXXXXXXX:pipeline/churnmodelsmpipeline',
 'ResponseMetadata': {'RequestId': '289e2b09-2c2d-432c-9394-5d73428e8f27',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '289e2b09-2c2d-432c-9394-5d73428e8f27',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Sat, 25 Sep 2021 03:38:12 GMT'},
  'RetryAttempts': 0}}

Start Pipeline Execution

In [14]:
# Start a pipeline execution and keep the returned handle for the status queries in later cells.
execution = pipeline.start()

Now we describe the execution instance and list its steps to find out more about the execution.

In [15]:
# Show the execution's metadata (ARNs, status, timestamps).
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:XXXXXXXXXX:pipeline/churnmodelsmpipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:XXXXXXXXXX:pipeline/churnmodelsmpipeline/execution/huy1iar0jtwu',
 'PipelineExecutionDisplayName': 'execution-1632541093204',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2021, 9, 25, 3, 38, 13, 113000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 9, 25, 3, 38, 13, 113000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:XXXXXXXXXX:user-profile/d-rnqglxiieyuf/<username>',
  'UserProfileName': '<username>',
  'DomainId': 'd-rnqglxiieyuf'},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:XXXXXXXXXX:user-profile/d-rnqglxiieyuf/<username>',
  'UserProfileName': '<username>',
  'DomainId': 'd-rnqglxiieyuf'},
 'ResponseMetadata': {'RequestId': 'ead2ffdf-3bc7-40d7-8dc5-8e77574d0b3a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ead2ffd

We can list the execution steps to check out the status and artifacts:

In [19]:
# List the steps of this execution with their status, timing and job metadata.
execution.list_steps()

[{'StepName': 'ClarifyProcessingStep',
  'StartTime': datetime.datetime(2021, 9, 25, 3, 54, 44, 485000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2021, 9, 25, 4, 15, 31, 394000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:XXXXXXXXXX:processing-job/pipelines-huy1iar0jtwu-clarifyprocessingste-neysrivlcs'}}},
 {'StepName': 'ChurnModelConfigFile',
  'StartTime': datetime.datetime(2021, 9, 25, 3, 50, 34, 762000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2021, 9, 25, 3, 54, 44, 326000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:XXXXXXXXXX:processing-job/pipelines-huy1iar0jtwu-churnmodelconfigfile-ngwnp2qkv3'}}},
 {'StepName': 'ChurnTransform',
  'StartTime': datetime.datetime(2021, 9, 25, 3, 50, 34, 679000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2021, 9, 25, 3, 56, 3, 303000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded