# 1. SageMaker Training with Experiments and Processing For AutoGluon

## 학습 작업의 실행 노트북 개요

- SageMaker Training에 SageMaker 실험을 추가하여 여러 실험의 결과를 비교할 수 있습니다.
 - [작업 실행 시 필요 라이브러리 import](#작업-실행-시-필요-라이브러리-import)
 - [SageMaker 세션과 Role, 사용 버킷 정의](#SageMaker-세션과-Role,-사용-버킷-정의)
 - [하이퍼파라미터 정의](#하이퍼파라미터-정의)
 - [학습 실행 작업 정의](#학습-실행-작업-정의)
 - 학습 코드 명
 - 학습 코드 폴더 명
 - 학습 코드가 사용한 Framework 종류, 버전 등
 - 학습 인스턴스 타입과 개수
 - SageMaker 세션
 - 학습 작업 하이퍼파라미터 정의
 - 학습 작업 산출물 관련 S3 버킷 설정 등
 - [학습 데이터셋 지정](#학습-데이터셋-지정)
 - 학습에 사용하는 데이터셋의 S3 URI 지정
 - [SageMaker 실험 설정](#SageMaker-실험-설정)
 - [학습 실행](#학습-실행)
 - [데이터 세트 설명](#데이터-세트-설명)
 - [실험 결과 보기](#실험-결과-보기)
 - [Evaluation 하기](#Evaluation-하기)

### 작업 실행 시 필요 라이브러리 import

In [None]:
# Install or upgrade the sagemaker-experiments package (provides the smexperiments module imported later).
!pip install -U sagemaker-experiments

In [None]:
import os
import sys
import json
import pandas as pd
import boto3
import sagemaker

In [None]:
# Add ./src to the import path; presumably this is where ag_model (imported next) lives.
sys.path.append('./src')

In [None]:
from ag_model import (
 AutoGluonTraining,
 AutoGluonInferenceModel,
 AutoGluonTabularPredictor,
 AutoGluonFramework
)

### SageMaker 세션과 Role, 사용 버킷 정의

In [None]:
# Create the SageMaker session and resolve the region and execution role
# that the rest of the notebook uses.
sagemaker_session = sagemaker.session.Session()
# Read the region through the public property instead of the private
# `_region_name` attribute, which is an implementation detail of Session.
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()

In [None]:
# Default SageMaker bucket, plus the S3 prefixes for uploaded training code
# and for job output.
bucket = sagemaker_session.default_bucket()
_s3_base = f"s3://{bucket}/autogluon"
code_location = f"{_s3_base}/code"
output_path = f"{_s3_base}/output"

### 하이퍼파라미터 정의

In [None]:
# Hyperparameters handed to the estimator; the only entry is the name of the
# config file to use.
hyperparameters = dict(config_name="config-med.yaml")

### 학습 데이터셋 지정

In [None]:
# S3 prefixes for the dataset and the config files.
data_path=f's3://{bucket}/autogluon/dataset'
config_path = f's3://{bucket}/autogluon/config'
# Sync the local dataset and config directories to those S3 prefixes.
!aws s3 sync ../data/dataset/ $data_path
!aws s3 sync ./config/ $config_path

# Display the dataset S3 URI.
data_path

### 학습 실행 작업 정의

In [None]:
# Infrastructure settings for the training job.
instance_count = 1
instance_type = "ml.m5.large"
# instance_type = 'local'  # switches the following cells to LocalSession
max_run = 1 * 60 * 60  # 3600

# max_wait is only given a value when spot instances are requested.
use_spot_instances = False
max_wait = 1 * 60 * 60 if use_spot_instances else None

In [None]:
# Choose the SageMaker session and the input channels for the selected mode.
if instance_type != 'local':
    # Managed training: both channels point at the S3 prefixes.
    sess = boto3.Session()
    sagemaker_session = sagemaker.Session()
    sm = sess.client('sagemaker')

    data_channels = {"inputdata": data_path, "config": config_path}
else:
    from sagemaker.local import LocalSession

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # Local mode: channels are file:// URIs derived from the working directory.
    # The dataset path is built with the '/lab_1_training' segment removed.
    local_data_path = "file://" + os.getcwd().replace('/lab_1_training', '') + "/data/dataset"
    local_config_path = "file://" + os.getcwd() + '/config'

    data_channels = {"inputdata": local_data_path, "config": local_config_path}

In [None]:
# Collect the training-job settings (script, S3 locations, infrastructure,
# framework version) and build the AutoGluon estimator from them.
training_job_settings = dict(
    entry_point="autogluon_starter_script.py",
    source_dir=os.getcwd() + "/src",
    role=role,
    sagemaker_session=sagemaker_session,
    output_path=output_path,
    code_location=code_location,
    hyperparameters=hyperparameters,
    instance_count=instance_count,
    instance_type=instance_type,
    framework_version="0.4",
    py_version="py38",
    max_run=max_run,
    use_spot_instances=use_spot_instances,  # whether to use spot instances
    max_wait=max_wait,
)
ag_estimator = AutoGluonTraining(**training_job_settings)

### SageMaker 실험 설정

In [None]:
# Name of the SageMaker experiment under which trials are created.
experiment_name='autogluon-poc-1'

In [None]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from time import strftime

In [None]:
def create_experiment(experiment_name):
    """Load the experiment named *experiment_name*, creating it if loading fails.

    Args:
        experiment_name: Name of the SageMaker experiment to load or create.

    Returns:
        The loaded or newly created ``Experiment`` object. Earlier versions
        returned ``None``; callers that ignore the result are unaffected.
    """
    try:
        return Experiment.load(experiment_name)
    except Exception:
        # Presumably the experiment does not exist yet, so create it. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return Experiment.create(experiment_name=experiment_name)

In [None]:
def create_trial(experiment_name):
    """Create a new trial in *experiment_name* and return the trial's name.

    The trial is named ``<experiment_name>-<MMDD-HHMMSS>`` using the local
    time at the moment of the call.

    Args:
        experiment_name: Name of the experiment the trial is attached to.

    Returns:
        The name of the created trial (callers reuse it as the job name).
    """
    # "%S" (seconds, 00-59) replaces the original lowercase "%s", which is not
    # a documented Python strftime directive and behaves differently per platform.
    create_date = strftime("%m%d-%H%M%S")

    sm_trial = Trial.create(
        trial_name=f"{experiment_name}-{create_date}",
        experiment_name=experiment_name,
    )
    return sm_trial.trial_name

### 학습 실행

In [None]:
# Display the input channels that are passed to fit() in the next cell.
data_channels

In [None]:
# Register an experiment trial and reuse its name as the training job name.
create_experiment(experiment_name)
job_name = create_trial(experiment_name)

# Tie the job to the trial created above.
experiment_config = {
    'TrialName': job_name,
    'TrialComponentDisplayName': job_name,
}

# wait=False: start the job without blocking this cell on its completion.
ag_estimator.fit(
    inputs=data_channels,
    job_name=job_name,
    experiment_config=experiment_config,
    wait=False,
)

In [None]:
# Display the logs of the estimator's training job.
ag_estimator.logs()

### 실험 결과 보기
위에서 실행한 실험의 결과를 확인합니다.
- 각 훈련 작업 시도별로 사용한 훈련 데이터, 모델 입력 하이퍼파라미터, 모델 평가 지표, 모델 아티팩트 저장 위치 등을 확인할 수 있습니다.
- **아래의 모든 내용은 SageMaker Studio 를 통해서 직관적으로 확인이 가능합니다.**

In [None]:
# Start from a clean ./autogluon directory, then copy the trained model artifact from S3 into it.
!rm -rf ./autogluon/
!mkdir -p ./autogluon/result
!aws s3 cp {ag_estimator.model_data} ./autogluon/

In [None]:
# List the downloaded archive to confirm it is present.
!ls -alF ./autogluon/model.tar.gz

In [None]:
# Extract the model archive into ./autogluon/result/.
!tar -xzf ./autogluon/model.tar.gz -C ./autogluon/result/

### Endpoint Deployment

In [None]:
# Instance type for the endpoint; setting it to 'local' selects LocalSession in the next cell.
instance_type = "ml.m5.2xlarge"
# instance_type = 'local'

In [None]:
# Pick the SageMaker session used for deployment.
if instance_type != 'local':
    sess = boto3.Session()
    sagemaker_session = sagemaker.Session()
else:
    from sagemaker.local import LocalSession

    # Local mode: run with local code instead of uploading it.
    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

In [None]:
# Describe the inference model: serving script, trained artifact, and the
# predictor class returned by deploy().
inference_model_settings = dict(
    source_dir=os.getcwd() + "/src",
    entry_point="autogluon_serve.py",
    model_data=ag_estimator.model_data,
    instance_type=instance_type,
    role=role,
    sagemaker_session=sagemaker_session,
    framework_version="0.4",
    py_version="py38",
    predictor_cls=AutoGluonTabularPredictor,
)
model = AutoGluonInferenceModel(**inference_model_settings)

In [None]:
from sagemaker.serializers import CSVSerializer

# Deploy the model to a single-instance endpoint; request payloads are
# serialized with CSVSerializer.
csv_serializer = CSVSerializer()
predictor = model.deploy(
    initial_instance_count=1,
    serializer=csv_serializer,
    instance_type=instance_type,
)

### Predict on unlabeled test data

Remove target variable (`fraud`) from the data and get predictions for a sample of 100 rows using the deployed endpoint.

In [None]:
# Load the test set, drop the label column, and keep the first 100 rows as a
# plain array for the endpoint request.
df = pd.read_csv("../data/dataset/test.csv")
features_only = df.drop(columns="fraud")
data = features_only[:100].values

In [None]:
# Send the rows to the endpoint, then parse the response as JSON into a DataFrame.
preds = predictor.predict(data)
pred_df = pd.DataFrame(json.loads(preds))

In [None]:
# Give predictions and labels a fresh 0..n-1 index so they align row by row
# when combined. The original called reset_index(inplace=True) on temporary
# Series (a column selection and a slice); that is not guaranteed to change
# pred_df or df, and has no effect under pandas copy-on-write. Rebinding the
# results makes the reset explicit.
pred_df = pred_df.reset_index(drop=True)
df = df.reset_index(drop=True)

In [None]:
# Put predicted and actual labels side by side for the rows that were scored.
n_scored = len(pred_df)
p = pd.DataFrame(
    {
        "preds": pred_df['fraud'],
        "actual": df["fraud"][:n_scored],
    }
)
p.head()

In [None]:
# Report how many predictions equal the actual label.
n_correct = (p.preds == p.actual).astype(int).sum()
print(f"{n_correct}/{len(p)} are correct")

### Cleanup Endpoint

In [None]:
# predictor.delete_endpoint()

# Batch Transform

학습된 모델을 호스팅 엔드포인트에 배포하는 기능은 SageMaker 출시 때부터 제공되어 왔으며, 웹 사이트나 모바일 앱과 같은 서비스에 실시간 예측을 제공하는 좋은 방법입니다. 그러나 지연 시간 최소화가 중요하지 않고, 학습된 모델로 대규모 데이터 세트에 대한 예측을 생성하는 것이 목표라면 배치 변환 기능이 더 쉽고, 더 확장 가능하며, 더 적절할 수 있습니다.

[Read more about Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform.html).

In [None]:
# Instance type used for the batch transform job.
instance_type = "ml.m5.2xlarge"

In [None]:
# model = AutoGluonInferenceModel(
# source_dir=os.getcwd() + "/src",
# entry_point="autogluon_serve.py",
# model_data=ag_estimator.model_data,
# instance_type=instance_type,
# role=role,
# sagemaker_session=sagemaker_session,
# region=region,
# framework_version="0.4",
# py_version="py38", 
# predictor_cls=AutoGluonTabularPredictor,
# )

In [None]:
# Build a batch transformer from the model: one instance, multi-record
# batches, JSON responses assembled line by line into the output file.
transformer_settings = dict(
    instance_count=1,
    instance_type=instance_type,
    strategy="MultiRecord",
    max_payload=6,
    max_concurrent_transforms=1,
    output_path=output_path,
    accept="application/json",
    assemble_with="Line",
)
transformer = model.transformer(**transformer_settings)


Prepare data for batch transform

In [None]:
# Save the first 100 test rows as a CSV without header or index column.
first_rows = pd.read_csv("../data/dataset/test.csv")[:100]
first_rows.to_csv("../data/dataset/test_no_header.csv", header=False, index=False)

In [None]:
# Upload the batch input next to the training dataset in S3.
# key_prefix is relative to the bucket, so it must not contain the bucket
# name: the original f"{bucket}/autogluon/dataset" produced an object key of
# "<bucket>/autogluon/dataset/..." inside the bucket. The bucket is now passed
# explicitly and the prefix is bucket-relative.
test_input = transformer.sagemaker_session.upload_data(
    path=os.path.join("../data/dataset", "test_no_header.csv"),
    bucket=bucket,
    key_prefix="autogluon/dataset",
)
# Display the resulting S3 URI.
test_input

In [None]:
# Run the batch transform over the uploaded CSV, one record per line.
# NOTE(review): "$[1:]" drops the first column of every record, which assumes
# the target variable is column 0 of test.csv; confirm against the dataset.
transform_options = dict(
    input_filter="$[1:]",  # filter out the target variable
    split_type="Line",
    content_type="text/csv",
    output_filter="$['fraud']",  # keep only the predicted class in the output
)
transformer.transform(test_input, **transform_options)

# Block until the transform job has finished.
transformer.wait()

batch transform 결과를 다운로드 받습니다.

In [None]:
# Recreate an empty local directory for the batch transform results.
!rm -rf ./autogluon_batch_result
!mkdir ./autogluon_batch_result

In [None]:
# Display the S3 location configured as the transformer's output path.
transformer.output_path

In [None]:
# Download the transform output for test_no_header.csv into the local result directory.
!aws s3 cp {transformer.output_path}/test_no_header.csv.out ./autogluon_batch_result/

In [None]:
# Batch predictions: read the output file, sort by index, and name the single
# column "preds".
batch_preds = pd.read_json("./autogluon_batch_result/test_no_header.csv.out", orient="index")
batch_preds = batch_preds.sort_index().rename(columns={0: "preds"})

# Actual labels for the same first 100 rows of the test set.
actual_labels = pd.read_csv("../data/dataset/test.csv")[["fraud"]].iloc[:100]
actual_labels = actual_labels.rename(columns={"fraud": "actual"})

# Place predictions and labels side by side.
p = pd.concat([batch_preds, actual_labels], axis=1)
p.head()

In [None]:
# Report how many batch predictions equal the actual label.
n_correct = (p.preds == p.actual).astype(int).sum()
print(f"{n_correct}/{len(p)} are correct")

### Processing Evaluation 하기
SageMaker Processing을 이용하여 Evaluation을 수행하는 코드를 실행할 수 있습니다. MLOps에서 Processing을 적용하면 전처리, Evaluation 등을 serverless로 실행할 수 있습니다.

In [None]:
from sagemaker.processing import FrameworkProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.estimator import Framework

In [None]:
# Instance settings for the processing (evaluation) job.
instance_count = 1
instance_type = "ml.m5.large"
# instance_type = 'local'

In [None]:
from sagemaker import image_uris

# Look up the AutoGluon 0.4 / py38 training image URI for the current region
# and instance type.
image_lookup = dict(
    region=region,
    version="0.4",
    py_version="py38",
    image_scope="training",
    instance_type=instance_type,
)
image_uri = image_uris.retrieve("autogluon", **image_lookup)
image_uri


In [None]:
# Processor that runs the evaluation script on the AutoGluon image.
# NOTE(review): no sagemaker_session is passed here, so a session assigned in
# a later cell is not picked up by this processor; confirm this is intended
# for local mode.
processor_settings = dict(
    framework_version="0.4",
    role=role,
    py_version="py38",
    image_uri=image_uri,
    instance_type=instance_type,
    instance_count=instance_count,
)
script_eval = FrameworkProcessor(AutoGluonFramework, **processor_settings)

In [None]:
# S3 prefix under which processing outputs are stored.
detect_outputpath = f's3://{bucket}/autogluon/processing'

In [None]:
# Resolve the session, code directory, and test-data location for the chosen mode.
if instance_type != 'local':
    sagemaker_session = sagemaker.Session()
    source_dir = 'src'
    s3_test_path = data_path + '/test.csv'
else:
    from sagemaker.local import LocalSession
    from pathlib import Path

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    # Local mode uses an absolute code directory and the local test file.
    source_dir = f'{Path.cwd()}/src'
    s3_test_path = '../data/dataset/test.csv'

In [None]:
# Register a new trial and reuse its name for the processing (evaluation) job.
create_experiment(experiment_name)
job_name = create_trial(experiment_name)

# Inputs: the test CSV and the trained model artifact.
processing_inputs = [
    ProcessingInput(
        source=s3_test_path,
        input_name="test_data",
        destination="/opt/ml/processing/test",
    ),
    ProcessingInput(
        source=ag_estimator.model_data,
        input_name="model_weight",
        destination="/opt/ml/processing/model",
    ),
]

# Output: contents of /opt/ml/processing/output, stored under a per-job S3 prefix.
processing_outputs = [
    ProcessingOutput(
        source="/opt/ml/processing/output",
        output_name='evaluation',
        destination=detect_outputpath + "/" + job_name,
    ),
]

# wait=False: start the job without blocking this cell.
script_eval.run(
    code="autogluon_evaluation.py",
    source_dir=source_dir,
    inputs=processing_inputs,
    outputs=processing_outputs,
    job_name=job_name,
    experiment_config={
        'TrialName': job_name,
        'TrialComponentDisplayName': job_name,
    },
    wait=False,
)

In [None]:
# Block until the most recently started processing job finishes.
script_eval.latest_job.wait()

### Code repository 생성 및 push
현재 사용하는 노트북의 IAM role에 IAMFullAccess를 추가한 이후에 아래 작업을 수행합니다.

In [None]:
from sagemaker import get_execution_role

In [None]:
# Attach the AWS-managed CodeCommit full-access policy to this notebook's
# execution role.
iam_client = boto3.client('iam')

role = get_execution_role()
# The role name is the text after the last '/' of the role ARN.
base_role_name = role.rsplit('/', 1)[-1]

codecommit_policy_arn = 'arn:aws:iam::aws:policy/AWSCodeCommitFullAccess'
iam_client.attach_role_policy(RoleName=base_role_name, PolicyArn=codecommit_policy_arn)

In [None]:
# Create the CodeCommit repository, or look it up when it already exists.
codecommit = boto3.client('codecommit')
repository_name = 'autogluon_code'

try:
    response = codecommit.create_repository(
        repositoryName=repository_name,
        repositoryDescription='Data Scientists share their training code using this Repository'
    )
except codecommit.exceptions.RepositoryNameExistsException:
    # Only a name clash means the repository already exists. Other failures
    # (permissions, throttling, invalid name, ...) now surface instead of
    # being reported as "already exists" by a bare except.
    print("Repository already exists")
    response = codecommit.get_repository(
        repositoryName=repository_name
    )

In [None]:
# HTTPS clone URL taken from the create/get repository response; displayed for reference.
codecommit_repo = response['repositoryMetadata']['cloneUrlHttp']
codecommit_repo

In [None]:
# Initialise a git repository here, add the CodeCommit remote, and push the
# config, source code, and this notebook to the main branch.
!git init
!git remote add repo_codecommit $codecommit_repo
!git checkout -b main
!git add ./config ./src ./1.SageMaker-Training+Experiments+Processing-AutoGluon.ipynb
!git commit -m "autogluon-update"
!git push --set-upstream repo_codecommit main