In [7]:
# Set Up Environment

import boto3
import sagemaker
import sagemaker.session


# Resolve the AWS context (region, SageMaker session, execution role, default
# bucket) that the later cells reference.
region = boto3.Session().region_name
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()

# Echo the resolved values, one "<label>: <value>" line each.
for label, value in (
    ("Region", region),
    ("Role", role),
    ("Default Bucket", default_bucket),
):
    print(f"{label}: {value}")

Region: us-west-2
Role: arn:aws:iam::933988069619:role/service-role/AmazonSageMaker-ExecutionRole-20230602T153402
Default Bucket: sagemaker-us-west-2-933988069619


In [8]:
# 1. Prepare Dataset

# S3 prefix used as the default location of the training data.
train_data_uri = "s3://app-ml-dataset-0410/flowers"

# NOTE(review): the test URI is the very same prefix as the training URI, so
# both defaults point at identical data -- confirm this is intentional.
test_data_uri = train_data_uri

print(
    f"Train Data Uri: {train_data_uri}",
    f"Test Data Uri: {test_data_uri}",
    sep="\n",
)

Train Data Uri: s3://app-ml-dataset-0410/flowers
Test Data Uri: s3://app-ml-dataset-0410/flowers


In [9]:
# 2. Define Pipeline Parameters

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

# Instance-count parameter.
# NOTE(review): not referenced by any step visible in this part of the
# notebook -- confirm whether it is still needed.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# Dataset locations; the defaults are the URIs prepared in the dataset cell.
train_data = ParameterString(name="TrainData", default_value=train_data_uri)
test_data = ParameterString(name="TestData", default_value=test_data_uri)

# Training knobs that can be overridden per pipeline execution.
epoch_data = ParameterInteger(name="Epoch", default_value=10)
batch_size_data = ParameterInteger(name="BatchSize", default_value=16)
learning_rate_data = ParameterFloat(name="LearningRate", default_value=0.0001)

In [10]:
# 3. Define Training Step

import tensorflow as tf
from sagemaker.tensorflow import TensorFlow, TrainingCompilerConfig
# This prints the TensorFlow version installed in the notebook kernel; the
# training job itself uses the `framework_version` given to the estimator below.
print(f"tf version: {tf.__version__}")

# Render each pipeline parameter with `.to_string()` before passing it on as a
# hyperparameter value.
epochs = epoch_data.to_string()
batch_size = batch_size_data.to_string()
learning_rate = learning_rate_data.to_string()

# Hyperparameters handed to the estimator.
hyperparameters = {
    "epochs": epochs,
    "batch-size": batch_size,
    "learning-rate": learning_rate,
}

# Metric name -> regex pairs, expanded into the {"Name", "Regex"} dicts the
# estimator expects. Order is kept as originally declared.
metric_definitions = [
    {"Name": metric_name, "Regex": pattern}
    for metric_name, pattern in (
        ("training_loss", "loss: ([0-9.]*?) "),
        ("training_accuracy", "accuracy: ([0-9.]*?) "),
        ("validation_loss", "val_loss: ([0-9.]*?) "),
        ("validation_accuracy", "val_accuracy: ([0-9.]*?)$"),
    )
]

# TensorFlow estimator that runs `src/train.py` on a single ml.p3.2xlarge
# instance with the default training-compiler configuration.
tf_estimator = TensorFlow(
    entry_point="train.py",
    source_dir="src",
    dependencies=[],
    role=role,
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    framework_version="2.11.0",
    py_version="py39",
    compiler_config=TrainingCompilerConfig(),
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    disable_profiler=True,
    base_job_name="app-flower-classifier-model",
    sagemaker_session=sagemaker_session,
)

2023-06-04 07:55:29.476643: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F, in other operations, rebuild TensorFlow with the appropriate compiler flags.


tf version: 2.12.0


In [11]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.workflow.properties import Properties

# Training step: one input channel per dataset parameter. Each channel takes
# its S3 location from the parameter (rendered with `.to_string()`) and uses
# the same "image/*" content type.
step_train = TrainingStep(
    name="Train",
    estimator=tf_estimator,
    inputs={
        channel: TrainingInput(
            s3_data=uri_parameter.to_string(),
            content_type="image/*",
        )
        for channel, uri_parameter in (("train", train_data), ("test", test_data))
    },
)

In [12]:
# 4. Create Model

from sagemaker.model import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.steps import CreateModelStep

# Inference model whose artifacts are the output of the "Train" step.
model = Model(
    # NOTE(review): the image URI is pinned to the us-west-2 registry and the
    # 2.11.0 CPU inference image -- it must be updated if the region or the
    # training framework version changes.
    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-inference:2.11.0-cpu",
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    sagemaker_session=sagemaker_session,
    # `RealTimePredictor` is the deprecated SDK v1 name; `Predictor` is its
    # SDK v2 replacement and avoids the deprecation warning.
    predictor_cls=sagemaker.predictor.Predictor,
    role=role,
)

# Instance type passed to the create-model step.
inputs = CreateModelInput(
    instance_type="ml.m5.large",
)

# Pipeline step that registers `model` as a SageMaker model.
step_create_model = CreateModelStep(
    name="CreateModel",
    model=model,
    inputs=inputs,
)

In [13]:
# 5. Create a pipeline

from sagemaker.workflow.pipeline import Pipeline


# Plain string literal: the name contains no placeholders, so no f-string is needed.
pipeline_name = "AppMlPipeline-Model"

# Assemble the pipeline from its overridable parameters and its two steps.
# "CreateModel" consumes the model artifacts of "Train" via step properties.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        train_data,
        test_data,
        epoch_data,
        batch_size_data,
        learning_rate_data,
    ],
    steps=[step_train, step_create_model],
)

In [14]:
# 6. Create or update the pipeline
# NOTE(review): only `upsert` is called here; no `pipeline.start()` call is
# visible in this part of the notebook, so this cell alone does not appear to
# launch an execution -- confirm against the SDK docs.
upsert_response = pipeline.upsert(role_arn=role)
upsert_response

{'PipelineArn': 'arn:aws:sagemaker:us-west-2:933988069619:pipeline/AppMlPipeline-Model',
 'ResponseMetadata': {'RequestId': 'c58d4397-821d-4850-9b00-6f605bfd0b0d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c58d4397-821d-4850-9b00-6f605bfd0b0d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '87',
   'date': 'Sun, 04 Jun 2023 07:55:32 GMT'},
  'RetryAttempts': 0}}

In [15]:
# 7. Describe pipeline
import json

# Parse the pipeline's JSON definition into Python objects so the cell
# displays it as a nested dict.
pipeline_definition = json.loads(pipeline.definition())
pipeline_definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'TrainData',
   'Type': 'String',
   'DefaultValue': 's3://app-ml-dataset-0410/flowers'},
  {'Name': 'TestData',
   'Type': 'String',
   'DefaultValue': 's3://app-ml-dataset-0410/flowers'},
  {'Name': 'Epoch', 'Type': 'Integer', 'DefaultValue': 10},
  {'Name': 'BatchSize', 'Type': 'Integer', 'DefaultValue': 16},
  {'Name': 'LearningRate', 'Type': 'Float', 'DefaultValue': 0.0001}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'Train',
   'Type': 'Training',
   'Arguments': {'AlgorithmSpecification': {'TrainingInputMode': 'File',
     'TrainingImage': '763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:2.11.0-gpu-py39',
     'MetricDefinitions': [{'Name': 'training_loss',
       'Regex': 'loss: ([0-9.]*?) '},
      {'Name': 'training_accuracy', 'Regex': 'accuracy: ([0-9.]*?) '},
      {'Name':