# <B> Training </B>
* Container: conda_pytorch_p39

## AutoReload

In [1]:
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. utils.ssm) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## 1. parameter store 설정

In [2]:
import boto3
from utils.ssm import parameter_store

In [3]:
# Region of the default boto3 session; the parameter-store helper is bound to it.
strRegionName = boto3.Session().region_name
pm = parameter_store(strRegionName)

# Project prefix, used below to build every other parameter-store key.
prefix = pm.get_params(key="PREFIX")

## 2. Training job on the preprocessed data

In [4]:
import os
import sagemaker
from omegaconf import OmegaConf
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor

* **Set Up SageMaker Experiment**
    - Create or load [SageMaker Experiment](https://docs.aws.amazon.com/sagemaker/latest/dg/experiments.html) for the example training job. This will create an experiment trial object in SageMaker.

In [5]:
from time import strftime
from smexperiments.trial import Trial
from smexperiments.experiment import Experiment

In [6]:
def create_experiment(experiment_name):
    """Load the SageMaker Experiment called ``experiment_name``, creating it if loading fails.

    Args:
        experiment_name: Name of the experiment to load or create.

    Returns:
        The object returned by ``Experiment.load`` or, when loading raises,
        by ``Experiment.create``.
    """
    try:
        return Experiment.load(experiment_name)
    except Exception:
        # Loading raises when the experiment cannot be loaded (presumably
        # because it does not exist yet). `Exception` rather than a bare
        # `except` so KeyboardInterrupt / SystemExit still propagate.
        return Experiment.create(experiment_name=experiment_name)

In [7]:
def create_trial(experiment_name):
    """Create a new trial in ``experiment_name`` and return its name.

    The trial name is ``<experiment_name>-<MMDD>-<HHMM><epoch seconds>``; the
    caller uses the returned name as the training-job name as well.

    Args:
        experiment_name: Name of an existing SageMaker Experiment.

    Returns:
        The ``trial_name`` of the newly created trial.
    """
    # Local import: the notebook only imports `strftime` from `time`.
    from time import localtime, time

    # The original format string used "%s" (epoch seconds), a platform-specific
    # libc extension that Python does not document. Build the same suffix
    # portably, from a single clock reading so both parts agree.
    now = time()
    create_date = strftime("%m%d-%H%M", localtime(now)) + str(int(now))

    sm_trial = Trial.create(trial_name=f'{experiment_name}-{create_date}',
                            experiment_name=experiment_name)

    return sm_trial.trial_name

* **Configure the training job**

    - Now we configure the training job, by modifying the `config.yaml` file that is stored in our source code directory.
    - We pass relative directory paths for the data based on the SageMaker mount directory on the remote instance.

In [8]:
# Source directory shipped to the training job, and the config folder inside it.
code_dir = "./code"
config_dir = f"{code_dir}/conf"

# Config file that is edited below before the job is launched.
config_path = os.path.join(config_dir, "config.yaml")

* params for training job

In [19]:
# Set to True to enable SageMaker to run locally
local_mode = False

if local_mode:
    from sagemaker.local import LocalSession

    instance_type = "local_gpu"

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # Local channels point at files below the current working directory.
    local_data_dir = os.getcwd() + '/data/preprocessing'
    pretrained = os.getcwd() + '/pretrained/CTC.nemo'

    data_channels = {
        "training": f"file://{local_data_dir}",
        "testing": f"file://{local_data_dir}",
        "pretrained": f"file://{pretrained}",
    }
else:
    instance_type = "ml.g4dn.8xlarge"  # other options: "ml.p3.2xlarge", "ml.p3.16xlarge"

    sagemaker_session = sagemaker.Session()

    # The training and testing channels share one location, so look it up once.
    prep_data_path = pm.get_params(key="-".join([prefix, "PREP-DATA-PATH"]))
    data_channels = {
        "training": prep_data_path,
        "testing": prep_data_path,
        "pretrained": pm.get_params(key=prefix + "-PRETRAINED-WEIGHT"),
    }

instance_count = 1
do_spot_training = False
max_wait = None
max_run = 1 * 60 * 60  # one hour, expressed in seconds

# Selects the resume / fine-tuning settings written into the config further down.
resume = True

proc_prefix = "/opt/ml/processing"
bucket_name = pm.get_params(key=prefix + "-BUCKET")

# S3 URIs always use "/" as separator, so join explicitly instead of relying
# on os.path.join (which follows the local OS path rules).
output_path = "/".join([f"s3://{bucket_name}", prefix, "training", "model-output"])
code_location = "/".join([f"s3://{bucket_name}", prefix, "training", "backup_codes"])

experiment_name = '-'.join([prefix, "nemo-experiments"])

# Number with an optional exponent. The original pattern contained a stray
# "[" ("[[01]"), which put a literal "[" into the character class.
num_re = "([0-9\\.]+)(e-?[01][0-9])?"
metric_definitions = [
    {"Name": "train_loss", "Regex": f"loss={num_re}"},
    {"Name": "wer", "Regex": f"wer:{num_re}"}
]

# Repository settings for the estimator; the credentials are read from the
# parameter store with enc=True and must never be printed.
git_config = {
    'repo': f'https://{pm.get_params(key="-".join([prefix, "CODE_REPO"]))}',
    'branch': 'main',
    'username': pm.get_params(key="-".join([prefix, "CODECOMMIT-USERNAME"]), enc=True),
    'password': pm.get_params(key="-".join([prefix, "CODECOMMIT-PWD"]), enc=True)
}

# Extra keyword arguments forwarded to the estimator constructor.
kwargs = {}

In [20]:
# Summarise the job parameters. The git credentials are masked so that they
# never end up in the saved notebook output.
print(
    (
        "experiment_name : {} \n"
        "train_instance_type : {} \n"
        "train_instance_count : {}\n"
        "data_channels : {}\n"
        "experiment_name : {}\n"
        " git_config : {}"
    ).format(
        experiment_name,
        instance_type,
        instance_count,
        data_channels,
        experiment_name,
        {
            key: ("****" if key in ("username", "password") else value)
            for key, value in git_config.items()
        },
    )
)

experiment_name : nemo-asr-nemo-experiments 
train_instance_type : ml.g4dn.8xlarge 
train_instance_count : 1
data_channels : {'training': 's3://sm-nemo-ramp/nemo-asr/preprocessing/data', 'testing': 's3://sm-nemo-ramp/nemo-asr/preprocessing/data', 'pretrained': 's3://sm-nemo-ramp/nemo-asr/pretrained'}
experiment_name : nemo-asr-nemo-experiments
 git_config : {'repo': 'https://git-codecommit.us-east-1.amazonaws.com/v1/repos/nemo-code', 'branch': 'main', 'username': '****', 'password': '****'}


* config

In [21]:
# Load the training configuration stored next to the source code; it is
# modified below and written back to the same file.
conf = OmegaConf.load(config_path)

# Sampling
conf.model.sample_rate = 16000

# Set Data Locations based on the mounted directory in the SageMaker instance
conf.model.train_ds.manifest_filepath = "/opt/ml/input/data/training/an4/train_manifest.json"
conf.model.validation_ds.manifest_filepath = "/opt/ml/input/data/testing/an4/test_manifest.json"

# Training setup
conf.trainer.accelerator = "gpu"
conf.trainer.num_nodes = instance_count
# No distributed strategy is set here; "ddp" is the value that was commented out.
conf.trainer.strategy = None
conf.trainer.max_epochs = 2

# Output directory for our experiment within the SageMaker instance
conf.exp_manager.exp_dir = "/opt/ml/model/"

# Create a Small Variant of the Conformer Model
conf.model.encoder.n_layers = 8
# NOTE(review): n_heads is set on `model` while n_layers is set on
# `model.encoder` -- confirm against config.yaml that this is the intended key.
conf.model.n_heads = 4
conf.model.spec_augment.time_masks = 5

# Set Optimizer parameters
conf.model.optim.lr = 2.0  # by default Noam scheduling is used, so the LR acts as a multiplier

if resume:
    # Resume from an existing checkpoint if there is one, and do not fail
    # when none is found.
    conf.exp_manager.resume_if_exists = True
    conf.exp_manager.resume_ignore_no_checkpoint = True
    # The pre-trained model we want to fine-tune (mounted from the "pretrained" channel)
    conf.init_from_nemo_model = "/opt/ml/input/data/pretrained/CTC.nemo"
else:
    # Start from scratch: no checkpoint resume and no pre-trained weights.
    conf.exp_manager.resume_if_exists = False
    conf.exp_manager.resume_ignore_no_checkpoint = False
    conf.init_from_nemo_model = None

OmegaConf.save(conf, config_path)

* Define training job

In [13]:
# Show the container image URI and the source directory the estimator will use.
pm.get_params(key=f"{prefix}-IMAGE-URI"), code_dir

('419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-training', './code')

In [22]:
# Estimator for the training job. The role ARN and the container image URI
# are read from the parameter store.
est = PyTorch(
    # --- code to run ---
    entry_point="speech_to_text_ctc.py",      # training script
    source_dir=code_dir,                      # directory holding the script and conf/
    git_config=git_config,
    hyperparameters={'config-path': 'conf'},
    # --- execution environment ---
    role=pm.get_params(key=prefix + "-SAGEMAKER-ROLE-ARN"),
    image_uri=pm.get_params(key=prefix + "-IMAGE-URI"),
    # (framework_version="1.13.1" / py_version="py39" are not passed; image_uri is given instead)
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size=256,
    max_run=max_run,
    sagemaker_session=sagemaker_session,
    # (smdistributed data-parallel `distribution` setting is left disabled)
    # --- artifacts ---
    code_location=code_location,
    output_path=output_path,
    # --- monitoring ---
    metric_definitions=metric_definitions,
    enable_sagemaker_metrics=True,
    disable_profiler=True,
    debugger_hook_config=False,
    **kwargs
)

* run

In [23]:
# No S3 checkpoint location when the estimator runs in local GPU mode.
if instance_type == 'local_gpu':
    est.checkpoint_s3_uri = None

# Ensure the experiment exists, then open a new trial; its name is reused as
# the training-job name.
create_experiment(experiment_name)
job_name = create_trial(experiment_name)

# Launch the job and block until it finishes.
est.fit(
    inputs=data_channels,
    job_name=job_name,
    experiment_config=dict(
        TrialName=job_name,
        TrialComponentDisplayName=job_name,
    ),
    wait=True,
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Cloning into '/tmp/tmplptyic8i'...
remote: Counting objects: 20, done.        
Already on 'main'


Your branch is up to date with 'origin/main'.


INFO:sagemaker:Creating training-job with name: nemo-asr-nemo-experiments-0322-10521679482352


2023-03-22 10:52:46 Starting - Starting the training job...
2023-03-22 10:53:00 Starting - Preparing the instances for training......
2023-03-22 10:54:01 Downloading - Downloading input data...
2023-03-22 10:54:26 Training - Downloading the training image..................
2023-03-22 10:57:53 Training - Training image download completed. Training in progress......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-03-22 10:58:35,669 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-03-22 10:58:35,689 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-03-22 10:58:35,700 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-03-22 10:58:35,702 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-03-22 10:58:35,9

* save model-path, experiment-name

In [24]:
# Store the model artifact location and the experiment name in the parameter
# store, overwriting any previous values.
pm.put_params(key=f"{prefix}-MODEL-PATH", value=est.model_data, overwrite=True)
pm.put_params(key=f"{prefix}-EXPERI-NAME", value=experiment_name, overwrite=True)

'Store suceess'

In [25]:
# Show the name that was used for both the trial and the training job.
job_name

'nemo-asr-nemo-experiments-0322-10521679482352'

* show experiments

In [132]:
from sagemaker.analytics import ExperimentAnalytics
import pandas as pd
# Limit how much of a DataFrame is rendered in cell output.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_colwidth", 100)

In [133]:
# Collect the trial components of the experiment.
# NOTE(review): the saved output of this cell is an AttributeError raised on a
# 'LocalSagemakerClient' -- presumably it was last run with a LocalSession;
# confirm it is executed with a regular SageMaker session.
trial_component_training_analytics = ExperimentAnalytics(
    sagemaker_session=sagemaker_session,
    experiment_name=experiment_name,
    # Optional ordering / filtering, currently unused:
    # sort_by="metrics.validation:auc.max",
    # sort_order="Descending",
    # metric_names=["validation:auc"]
)

# Keep the identifying columns plus the Last/Min/Max/Avg summary of each metric.
trial_component_training_analytics.dataframe()[
    ['Experiments', 'Trials']
    + [
        f'{metric} - {stat}'
        for metric in ('wer', 'train_loss')
        for stat in ('Last', 'Min', 'Max', 'Avg')
    ]
]

AttributeError: 'LocalSagemakerClient' object has no attribute 'search'

In [None]:
from sagemaker.analytics import TrainingJobAnalytics
# Fetch the metrics of the training job named `job_name` as a DataFrame.
df = TrainingJobAnalytics(
    training_job_name=job_name,
).dataframe()

In [53]:
# Display the training-job metrics collected above.
df

Unnamed: 0,timestamp,metric_name,value
0,0.0,train_loss,76.866667
1,0.0,wer,1170.0


## code pipeline trigger

In [134]:
import boto3
# Client used to trigger the CodePipeline run.
pipeline_client = boto3.client("codepipeline")

In [None]:
# Start the code pipeline. `boto3` is already imported and `pipeline_client`
# is created in the preceding cell, so neither is repeated here.
code_pipeline_name = "mlops-code-pipeline"
pipeline_client.start_pipeline_execution(name=code_pipeline_name)