# <B> Setup for NVIDIA NeMo with SageMaker </B>
* Container: conda_python3

## AutoReload

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## 0. Install packages

In [2]:
# Switch for the dependency-installation cell below, which installs packages
# and shuts the kernel down for a restart when this is True.
install_needed = True  # should only be True once
# install_needed = False

In [3]:
%%bash
#!/bin/bash
# Make sure the Docker daemon config defines "data-root" and "default-shm-size".
# When "data-root" is already present the config is left untouched; otherwise the
# current file is backed up to daemon.json.bak, rewritten, and Docker is restarted.

DAEMON_PATH="/etc/docker"
MEMORY_SIZE=10G

# Holds the literal string "true" when daemon.json already has a "data-root" key.
has_data_root=$(jq 'has("data-root")' < $DAEMON_PATH/daemon.json)

if [ "$has_data_root" != true ]; then
    echo "Add data-root and default-shm-size=$MEMORY_SIZE"
    sudo cp $DAEMON_PATH/daemon.json $DAEMON_PATH/daemon.json.bak
    # Merge the two keys into the backed-up config and write the result back in place.
    sudo cat $DAEMON_PATH/daemon.json.bak \
        | jq --arg shm "$MEMORY_SIZE" '. += {"data-root":"/home/ec2-user/SageMaker/.container/docker","default-shm-size":$shm}' \
        | sudo tee $DAEMON_PATH/daemon.json > /dev/null
    sudo service docker restart
    echo "Docker Restart"
else
    echo "Already revised"
fi

Already revised


In [4]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U pip
    !{sys.executable} -m pip install -U smdebug sagemaker-experiments
    !{sys.executable} -m pip install -U sagemaker
    !{sys.executable} -m pip install -U datasets transformers
    !{sys.executable} -m pip install -U wget omegaconf text-unidecode sox
    
    ## Install NeMo
    !sudo yum install sox -y
    !sudo yum install libsndfile -y
    !pip install --upgrade --force-reinstall llvmlite
    BRANCH = 'main'
    !{sys.executable} -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
    
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.3.1
    Uninstalling pip-22.3.1:
      Successfully uninstalled pip-22.3.1
Successfully installed pip-23.0.1
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting smdebug
  Downloading smdebug-1.0.12-py2.py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.1/270.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sagemaker-experiments
  Downloading sagemaker_experiments-0.1.43-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 k

## 1. Set roles

In [2]:
from sagemaker import get_execution_role

In [3]:
# Keep only the text after the last "/" of the execution-role ARN, i.e. the role name.
strSageMakerRoleName = get_execution_role().rpartition('/')[2]
print ("SageMaker Execution Role Name: {}".format(strSageMakerRoleName))

SageMaker Execution Role Name: AmazonSageMaker-ExecutionRole-20221206T163436


## 1.1 Attach IAM policy to sagemaker execution role (<b>with console</b>)
> **EC2ContainerRegistry**: "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess" <BR>
> **S3**: "arn:aws:iam::aws:policy/AmazonS3FullAccess"

## 2. Set default parameters

In [4]:
import boto3
import sagemaker

### Bucket / Prefix 설정

In [5]:
# Region of the current boto3 session and the account ID reported by STS for the active credentials.
strRegionName = boto3.Session().region_name
strAccountId = boto3.client("sts").get_caller_identity().get("Account")
bucket_name = 'sm-nemo-ramp' # <-- Set the name of the S3 bucket to use. ex) sagemaker-us-east-1-123456789123, sm-nemo-bucket
prefix = 'nemo-asr' ## <-- Set the working prefix. ex) nemo-test, nemo-asr

## 3. Create training custom docker image

* docker build

In [None]:
# Display the training Dockerfile (syntax-highlighted) for review before building.
!pygmentize custom-docker/Dockerfile

* Base 이미지의 region, account-id 확인 후 아래 파라미터 입력

### 1) AWS CLI 를 이용한 방식

In [None]:
%%bash
# Build the training image from custom-docker/Dockerfile and push it to this
# account's ECR registry. The pushed image URI is echoed on the last line.
set -euo pipefail  # stop at the first failing step instead of pushing after a failed cd/build

strRepositoryName="nemo-test-training"  ## <-- Name of the Docker (ECR) repository to create/use
strDockerDir="./custom-docker/"
strTag="latest"

cd "${strDockerDir}"
pwd
container_name="${strRepositoryName}"

account=$(aws sts get-caller-identity --query Account --output text)

# Region from the current AWS CLI configuration. There is no implicit default:
# an empty region would produce a malformed registry host name, so fail loudly.
region=$(aws configure get region || true)
if [ -z "${region}" ]; then
    echo "No AWS region configured; run 'aws configure set region <region>' first" >&2
    exit 1
fi

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${container_name}:${strTag}"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --repository-names "${container_name}" > /dev/null 2>&1; then
    aws ecr create-repository --repository-name "${container_name}" > /dev/null
fi

# NOTE(review): if the Dockerfile's base image is pulled from another ECR registry,
# log in to that registry before building, e.g.:
# aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin "763104351884.dkr.ecr.us-west-2.amazonaws.com"

# Build the image locally, tagged directly with its full ECR name.
docker build -f Dockerfile -t "${fullname}" .

# Log in to this account's registry. `aws ecr get-login` was removed in AWS CLI v2
# and put the password on the command line; get-login-password pipes it via stdin.
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}"
docker push "${fullname}"
echo "${fullname}"

In [6]:
strEcrRepositoryUri = '419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-training'  ## <-- Enter the URI of the ECR repository created above. ex) 123456789123.dkr.ecr.us-west-2.amazonaws.com/nemo-test:latest

### 2) AWS BOTO3 SDK를 이용한 방식

In [None]:
from utils.ecr import ecr_handler
# Project helper whose build_docker / register_image_to_ecr methods are called in the cells below.
ecr = ecr_handler()

In [None]:
# Settings for building the training image with the boto3-based helper.
strRepositoryName="nemo-test-training"  ## <-- Name of the Docker repository to use
# Normalized to lowercase — presumably because ECR repository names must be lowercase; TODO confirm.
strRepositoryName = strRepositoryName.lower()
strDockerDir = "./custom-docker/"
strDockerFile = "Dockerfile"
strTag = "latest"

In [None]:
# Build the training image. Per the note above this section, strRegionName/strAccountId here
# should be the region and account ID of the base image's registry — verify against the Dockerfile.
ecr.build_docker(strDockerDir, strDockerFile, strRepositoryName, strRegionName="us-west-2", strAccountId="763104351884")

* Push the image to ECR

In [None]:
# Register the built image in this account's ECR (region/account from the defaults cell);
# the helper's return value is kept as the training image URI.
strEcrRepositoryUri = ecr.register_image_to_ecr(strRegionName, strAccountId, strRepositoryName, strTag)

* Save image-uri to parameter store

## 4. Create inference custom docker image

* docker build

In [None]:
# Display the inference Dockerfile (syntax-highlighted) for review before building.
!pygmentize custom-docker/Dockerfile.inf

### 1) AWS CLI 를 이용한 방식

In [None]:
%%bash
# Build the inference image from custom-docker/Dockerfile.inf and push it to this
# account's ECR registry. The pushed image URI is echoed on the last line.
set -euo pipefail  # stop at the first failing step instead of pushing after a failed cd/build

strRepositoryName="nemo-test-inference"
strDockerDir="./custom-docker/"
strTag="latest"

cd "${strDockerDir}"
pwd
container_name="${strRepositoryName}"

account=$(aws sts get-caller-identity --query Account --output text)

# Region from the current AWS CLI configuration. There is no implicit default:
# an empty region would produce a malformed registry host name, so fail loudly.
region=$(aws configure get region || true)
if [ -z "${region}" ]; then
    echo "No AWS region configured; run 'aws configure set region <region>' first" >&2
    exit 1
fi

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${container_name}:${strTag}"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --repository-names "${container_name}" > /dev/null 2>&1; then
    aws ecr create-repository --repository-name "${container_name}" > /dev/null
fi

# NOTE(review): if the Dockerfile's base image is pulled from another ECR registry,
# log in to that registry before building, e.g.:
# aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin "763104351884.dkr.ecr.us-west-2.amazonaws.com"

# Build the image locally, tagged directly with its full ECR name.
docker build -f Dockerfile.inf -t "${fullname}" .

# Log in to this account's registry. `aws ecr get-login` was removed in AWS CLI v2
# and put the password on the command line; get-login-password pipes it via stdin.
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}"
docker push "${fullname}"
echo "${fullname}"

In [7]:
strInfEcrRepositoryUri = '419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-inference'  ## <-- Enter the URI of the ECR repository created above. ex) 123456789123.dkr.ecr.us-west-2.amazonaws.com/nemo-test:latest


### 2) AWS BOTO3 SDK를 이용한 방식

In [33]:
from utils.ecr import ecr_handler
# Project helper whose build_docker / register_image_to_ecr methods are called in the cells below.
ecr = ecr_handler()

In [34]:
# Settings for building the inference image with the boto3-based helper.
strInfRepositoryName="nemo-test-inference"  ## <-- Name of the Docker repository to use
# Normalized to lowercase — presumably because ECR repository names must be lowercase; TODO confirm.
strInfRepositoryName = strInfRepositoryName.lower()
strDockerFile = "Dockerfile.inf"
strDockerDir = "./custom-docker/"
strTag = "latest"

In [35]:
# Build the inference image. The region/account passed here are the ones the helper logs in to
# before building (see the `aws ecr get-login` line in the output); they match the registry of the
# Dockerfile's base image, not this account.
ecr.build_docker(strDockerDir, strDockerFile, strInfRepositoryName, strRegionName="us-west-2", strAccountId="763104351884")

/home/ec2-user/SageMaker/nemo-on-sagemaker/1.building-component
/home/ec2-user/SageMaker/nemo-on-sagemaker/1.building-component/custom-docker
strDockerFile Dockerfile.inf
aws ecr get-login --region 'us-west-2' --registry-ids '763104351884' --no-include-email




Login Succeeded

Sending build context to Docker daemon  10.75kB

Step 1/4 : From 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:1.13.1-gpu-py39
 ---> 58538dc47aa0
Step 2/4 : RUN pip install --no-cache-dir --upgrade pip  && pip install --no-cache-dir -U sagemaker nvgpu hydra-core librosa sentencepiece youtokentome inflect pyannote.audio  && pip install --no-cache-dir -U braceexpand webdataset editdistance jiwer jsonlines  && pip install --no-cache-dir pytorch-lightning==1.9.4  && pip install --no-cache-dir git+https://github.com/huggingface/transformers  && pip install --no-cache-dir git+https://github.com/NVIDIA/NeMo.git@main
 ---> Using cache
 ---> 9167df4989d0
Step 3/4 : COPY list_gpus.py /opt/conda/lib/python3.9/site-packages/nvgpu/list_gpus.py
 ---> Using cache
 ---> 72636f7b8709
Step 4/4 : WORKDIR /
 ---> Using cache
 ---> ea426d21de1c
Successfully built ea426d21de1c
Successfully tagged nemo-test-inference:latest

/home/ec2-user/SageMaker/nemo-on-sagemaker/1.build

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [36]:
# Register the built image in this account's ECR; the output below shows the helper creating the
# repository, tagging the local image and pushing it. The return value is kept as the inference image URI.
strInfEcrRepositoryUri = ecr.register_image_to_ecr(strRegionName, strAccountId, strInfRepositoryName, strTag)

== REGISTER AN IMAGE TO ECR ==
  processing_repository_uri: 419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-inference:latest
aws ecr get-login --region 'us-east-1' --registry-ids '419974056037' --no-include-email


https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded

aws ecr create-repository --repository-name 'nemo-test-inference'
docker tag 'nemo-test-inference:latest' '419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-inference:latest'
docker push '419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-inference:latest'
== REGISTER AN IMAGE TO ECR ==


## 5. Download & Upload dataset

In [8]:
import os
import wget

In [9]:
# Local directory (relative to the notebook) where the dataset archive is stored.
data_dir = "./data"

In [10]:
# Fetch the AN4 archive into data_dir unless a copy is already there;
# either way an4_path ends up holding the archive's location.
print("******")
os.makedirs(data_dir, exist_ok=True)
if os.path.exists(data_dir + '/an4_sphere.tar.gz'):
    # Archive already on disk: skip the download.
    print("Tarfile already exists.")
    an4_path = data_dir + '/an4_sphere.tar.gz'
else:
    an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'
    an4_path = wget.download(an4_url, data_dir)
    print(f"Dataset downloaded at: {an4_path}")

******
Dataset downloaded at: ./data/an4_sphere.tar.gz


* upload data to s3

### 1) AWS CLI 를 이용한 방식

In [11]:
# Sync the local data directory to s3://<bucket_name>/<prefix>/data, suppressing per-file output.
!aws s3 sync $data_dir s3://$bucket_name/$prefix/data --quiet

### 2) AWS BOTO3 SDK를 이용한 방식

In [None]:
from utils.s3 import s3_handler

In [None]:
# Project S3 helper; its upload_dir method is used in the next cell.
s3 = s3_handler()

In [None]:
# Hand the local dataset directory to the project's S3 helper
# (target bucket: bucket_name, target directory: "<prefix>/data").
# The bucket is taken from `bucket_name` directly: the parameter-store client `pm`
# is only created in a later section, so reading the bucket through it fails on a
# fresh top-to-bottom run.
source_dir, target_bucket, target_dir = data_dir, bucket_name, prefix+"/data"
s3.upload_dir(source_dir, target_bucket, target_dir)

## 6. Upload Pretrained model

In [12]:
pretrained = os.getcwd() + '/pretrained/CTC.nemo'

pretrained_s3uri = os.path.join(
    "s3://{}".format(bucket_name),
    prefix,
    "pretrained",
)

!aws s3 sync pretrained $pretrained_s3uri

upload: pretrained/CTC.nemo to s3://sm-nemo-ramp/nemo-asr/pretrained/CTC.nemo


## 7. CodeCommit 생성
- Attach IAM policy to sagemaker execution role (<b>with console</b>)
> **CodeCommit**: "arn:aws:iam::aws:policy/AWSCodeCommitFullAccess"<BR>
> **SecretsManager**: "arn:aws:iam::aws:policy/SecretsManagerReadWrite"<BR>

### 7.1 CodeCommit 관련 Credentials 생성 및 Secret Manager에 저장하기
- CodeCommit Credentials

In [13]:
user_name = 'dongjin' ## ==> IAM user name; check it in the IAM console.
# Name derived from the IAM user name; the bare last line displays it as the cell output.
codecommit_cred = 'codecommit-cred-'+user_name
codecommit_cred

'codecommit-cred-dongjin'

In [14]:
# IAM client used below to list, delete and create service-specific (CodeCommit) credentials.
iam_client = boto3.client('iam')

In [15]:
# Rotate the CodeCommit Git credentials of `user_name`: delete the last listed
# existing credential (if any), then create a fresh one.
try:
    existing_creds = iam_client.list_service_specific_credentials(
        UserName=user_name,
        ServiceName='codecommit.amazonaws.com'
    )['ServiceSpecificCredentials']
    if existing_creds:
        iam_client.delete_service_specific_credential(
            UserName=user_name,
            ServiceSpecificCredentialId=existing_creds[-1]['ServiceSpecificCredentialId']
        )
except Exception as err:
    # Clean-up is best effort, but report why it failed instead of hiding it.
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    print(f"Could not remove existing CodeCommit credentials ({err}); creating new ones")

# A failure here is a real error and is allowed to propagate.
response = iam_client.create_service_specific_credential(
    UserName=user_name,
    ServiceName='codecommit.amazonaws.com'
)
ServiceUserName = response['ServiceSpecificCredential']['ServiceUserName']
ServicePassword = response['ServiceSpecificCredential']['ServicePassword']
# Never echo the secret itself: cell outputs are saved with the notebook file.
print(f"ServiceUserName : {ServiceUserName} \nServicePassword : ******** (held in the ServicePassword variable)")

ServiceUserName : dongjin-at-419974056037 
ServicePassword : <redacted>


In [16]:
code_repository_name = 'nemo-code' ## ==> Name of the CodeCommit repository to use. ex) model_code
local_code_dir = './code' ## ==> Local folder holding the code to commit. ex) code

In [17]:
codecommit = boto3.client('codecommit')

# Create the repository, or look it up when one with this name already exists.
# Only the "name already exists" error is handled; any other failure (permissions,
# invalid name, throttling, ...) propagates instead of being misreported.
try:
    response = codecommit.create_repository(
        repositoryName=code_repository_name,
        repositoryDescription='Data Scientists share their training code using this Repository'
    )
except codecommit.exceptions.RepositoryNameExistsException:
    print("Repository already exists")
    response = codecommit.get_repository(
        repositoryName=code_repository_name
    )

In [18]:
# HTTPS clone URL from the create/get repository response; the bare last line displays it.
codecommit_repo = response['repositoryMetadata']['cloneUrlHttp']
codecommit_repo

'https://git-codecommit.us-east-1.amazonaws.com/v1/repos/nemo-code'

In [None]:
# Destructive: removes any existing Git metadata in the current directory so the next cell starts from a fresh `git init`.
!rm -rf .git/

In [23]:
# Initialize a Git repository in the current directory, commit the local code folder
# on a new `main` branch, and push it to the CodeCommit remote.
!git init
!git remote add repo_codecommit $codecommit_repo
!git checkout -b main
!git add $local_code_dir
!git commit -m "code-update"
!git push --set-upstream repo_codecommit main

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /home/ec2-user/SageMaker/nemo-on-sagemaker/1.building-component/.git/
Switched to a new branch 'main'
[main (root-commit) 3c3cb2a] code-update
 Committer: EC2 Default User <ec2-user@ip-172-16-94-199.ec2.internal>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly:

    git config --global user.name "Your Name"
    git config --global user.

## 8. [Optional] AWS Systems Manager Parameter Store 를 이용한 파라미터 저장/활용
- [AWS Systems Manager Parameter Store](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html)
- Attach IAM policy to sagemaker execution role (<b>with console</b>)
> **SSM**: "arn:aws:iam::aws:policy/AmazonSSMFullAccess"<BR>

In [19]:
from utils.ssm import parameter_store

In [20]:
# Project parameter-store helper bound to the session's region; used below via put_params / get_params.
pm = parameter_store(strRegionName)

In [26]:
# Persist the values configured in this notebook to the parameter store. Every key
# except "PREFIX" is namespaced as "<prefix>-<NAME>". The last call's return value is
# displayed as the cell output.
pm.put_params(key="PREFIX", value=prefix, overwrite=True)
pm.put_params(key="-".join([prefix, "REGION"]), value=strRegionName, overwrite=True)
pm.put_params(key="-".join([prefix, "BUCKET"]), value=bucket_name, overwrite=True)
pm.put_params(key="-".join([prefix, "SAGEMAKER-ROLE-ARN"]), value=get_execution_role(), overwrite=True)
pm.put_params(key="-".join([prefix, "ACCOUNT-ID"]), value=strAccountId, overwrite=True)
pm.put_params(key="-".join([prefix, "IMAGE-URI"]), value=strEcrRepositoryUri, overwrite=True)
pm.put_params(key="-".join([prefix, "INF-IMAGE-URI"]), value=strInfEcrRepositoryUri, overwrite=True)
pm.put_params(key="-".join([prefix, "S3-DATA-PATH"]), value=f"s3://{bucket_name}/{prefix}/data", overwrite=True)
# CODE_REPO is read back by the verification cell, so it has to be written here;
# otherwise a fresh run depends on a value left over from an earlier session.
# The "https://" scheme is stripped because, per the original note, the value
# cannot be stored with it.
pm.put_params(key="-".join([prefix, "CODE_REPO"]), value=codecommit_repo.replace('https://',''), overwrite=True)
# Credentials are stored with enc=True — presumably encrypted by the helper; TODO confirm.
pm.put_params(key="-".join([prefix, "CODECOMMIT-USERNAME"]), value=ServiceUserName, overwrite=True, enc=True)
pm.put_params(key="-".join([prefix, "CODECOMMIT-PWD"]), value=ServicePassword, overwrite=True, enc=True)
pm.put_params(key="-".join([prefix, "PRETRAINED-WEIGHT"]), value=pretrained_s3uri, overwrite=True)
pm.put_params(key="-".join([prefix, "RETRAIN"]), value=False, overwrite=True)

'Store suceess'

In [28]:
# Read every stored parameter back and print it as "<label>: <value>".
# "PREFIX" is the only key that is not namespaced with the prefix.
print ('PREFIX: {}'.format(pm.get_params(key="PREFIX")))

# Remaining labels in display order, each with the extra keyword arguments its
# lookup needs (the two credential entries are fetched with enc=False).
for label, extra_kwargs in (
    ("REGION", {}),
    ("BUCKET", {}),
    ("SAGEMAKER-ROLE-ARN", {}),
    ("ACCOUNT-ID", {}),
    ("IMAGE-URI", {}),
    ("INF-IMAGE-URI", {}),
    ("S3-DATA-PATH", {}),
    ("CODE_REPO", {}),
    ("CODECOMMIT-USERNAME", {"enc": False}),
    ("CODECOMMIT-PWD", {"enc": False}),
    ("PRETRAINED-WEIGHT", {}),
    ("RETRAIN", {}),
):
    stored_value = pm.get_params(key="-".join([prefix, label]), **extra_kwargs)
    print (f'{label}: {stored_value}')

PREFIX: nemo-asr
REGION: us-east-1
BUCKET: sm-nemo-ramp
SAGEMAKER-ROLE-ARN: arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436
ACCOUNT-ID: 419974056037
IMAGE-URI: 419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-training
INF-IMAGE-URI: 419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-inference
S3-DATA-PATH: s3://sm-nemo-ramp/nemo-asr/data
CODE_REPO: git-codecommit.us-east-1.amazonaws.com/v1/repos/nemo-code
CODECOMMIT-USERNAME: AQICAHixC/mZVJcnQEHIgIK/d13m2pRN5MnNJb7dfKPh/9fZ0QGVuZrBQ7W5B/Cj1oZume/1AAAAdTBzBgkqhkiG9w0BBwagZjBkAgEAMF8GCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMtW7mBk1BA6fgi+0VAgEQgDJGMPSMIYkgaFpV+HE3Rbca28eOkuoG5BqDwQwZzdd/KXD16cp+Sx+27D9TqE2ZAXGy7g==
CODECOMMIT-PWD: AQICAHixC/mZVJcnQEHIgIK/d13m2pRN5MnNJb7dfKPh/9fZ0QHUzV+0Wm5yTR1IjSI8FYX4AAAAizCBiAYJKoZIhvcNAQcGoHsweQIBADB0BgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDGD2gd38cN8IbiI3eQIBEIBHLxltBZUR2+CxagsfFsh2EbMN2NA3s0L+aoh/LWu8hae0KAkN9HSn92gXO6rp0y2602plsH8UGiSM4tEWbXmk2uQWR+bufBA=


In [None]:
# Reference: CodeCommit HTTPS clone URLs follow the pattern below, e.g.
#   https://git-codecommit.us-east-1.amazonaws.com/v1/repos/nemo-code
# (The original scratch lines were a bare URL, which is a SyntaxError, and an
# f-string using shell-style `${region}` with a repository name from another project.)
f'https://git-codecommit.{strRegionName}.amazonaws.com/v1/repos/{code_repository_name}'