#!/usr/bin/env python
# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

# SageMaker training-container entry point that launches BERT SQuAD
# fine-tuning (run_squad.py) inside the container's /workspace/bert tree.
# It runs in File mode: input is copied to /opt/ml/input/data and
# hyperparameters arrive via /opt/ml/input/config/hyperparameters.json.
from __future__ import print_function

import json
import os
import subprocess
import sys
import traceback

# These are the paths to where SageMaker mounts interesting things in your container.
prefix = '/opt/ml/'
# Location of the pre-baked BERT code/checkpoint inside the container image.
workspace = '/workspace/bert'

input_path = os.path.join(prefix, 'input/data')
output_path = os.path.join(prefix, 'output')
# Pretrained checkpoint shipped with the image, passed as --init_checkpoint.
model_path = os.path.join(workspace, 'bert_base.pt')
# SageMaker writes the job's hyperparameters (all values as strings) here.
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')

# This algorithm has a single channel of input data called 'training'. Since we run in
# File mode, the input files are copied to the directory specified here.
channel_name = 'training'
training_path = os.path.join(input_path, channel_name)

# Default training invocation: script name plus the flags every run needs.
training_script = 'run_squad.py'
default_params = ['--init_checkpoint', str(model_path),
                  '--do_train', '--do_lower_case', '--fp16']
# Exported explicitly so the underscore-prefixed helpers remain importable
# (e.g. for unit tests) despite their private naming.
__all__ = ['_run', '_hyperparameters_to_cmd_args']


def _run(cmd):
    """Invokes your training algorithm.

    Args:
        cmd: argv-style list handed to subprocess.Popen (no shell is used).

    Raises:
        Exception: if the child exits non-zero; the message carries the
            return code, the command, and the captured stderr bytes.
    """
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               env=os.environ)
    stdout, stderr = process.communicate()
    # communicate() has already waited for the child, so poll() just
    # reports the final return code.
    return_code = process.poll()
    if return_code:
        error_msg = 'Return Code: {}, CMD: {}, Err: {}'.format(return_code, cmd, stderr)
        raise Exception(error_msg)


def _hyperparameters_to_cmd_args(hyperparameters):
    """
    Converts our hyperparameters, in json format, into key-value pair suitable for passing
    to our training algorithm.
    """
    cmd_args_list = []
    for key, value in hyperparameters.items():
        cmd_args_list.append('--{}'.format(key))
        # SageMaker delivers hyperparameter values as strings, but coerce
        # defensively: the args are later joined with ' '.join(), which
        # raises TypeError on non-string items.
        cmd_args_list.append(str(value))
    return cmd_args_list


if __name__ == '__main__':
    try:
        # Amazon SageMaker makes our specified hyperparameters available within the
        # /opt/ml/input/config/hyperparameters.json.
        # https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html#your-algorithms-training-algo-running-container

        # Stage the SQuAD data: prefer the SageMaker 'train' channel if it
        # was provided, otherwise download the dataset inside the container.
        if os.path.exists('/opt/ml/input/data/train/'):
            os.system('cp /opt/ml/input/data/train/* /workspace/bert/data/squad')
        else:
            # Debug listings to show what SageMaker actually mounted.
            print(os.system('ls /opt/ml/input/'))
            print(os.system('echo ------------'))
            print(os.system('ls /opt/ml/input/data/'))
            print(os.system('echo ------------'))
            print(os.system('ls /opt/ml/input/data/train'))
            print('downloading data...')
            os.system('cd /workspace/bert/data/squad/ && bash squad_download.sh')

        with open(param_path, 'r') as tc:
            hyperparams = json.load(tc)
        print(hyperparams)

        python_executable = sys.executable
        # Required key: controls single- vs multi-GPU launch below. A
        # missing key raises KeyError, which the outer handler reports.
        num_gpus = hyperparams['num_gpus']

        # Split SageMaker-injected parameters (keys prefixed 'sage') from
        # the ones forwarded to the training script. sage_params is kept
        # for future use (e.g. sagemaker_submit_directory) but unused now.
        sage_params = {}
        training_params = {}
        for t in hyperparams:
            if t[:4] == 'sage':
                sage_params[t] = hyperparams[t]
            else:
                training_params[t] = hyperparams[t]

        # 'num_gpus' steers the launcher, not run_squad.py, so it is never
        # forwarded as a training argument.
        del training_params['num_gpus']
        cmd_args = _hyperparameters_to_cmd_args(training_params)

        if int(num_gpus) > 1:
            # Multi-GPU: wrap the script in torch.distributed.launch.
            train_cmd = [python_executable, '-m', 'torch.distributed.launch',
                         f'--nproc_per_node={num_gpus}',
                         training_script] + default_params + cmd_args
        else:
            train_cmd = [python_executable, training_script] + default_params + cmd_args

        print('train_cmd: ', train_cmd)
        # os.system returns the shell's wait status. Previously it was
        # ignored, so a failed training run still reached sys.exit(0) and
        # the job was marked Succeeded; raise instead so it is marked Failed.
        exit_status = os.system(" ".join(train_cmd))
        if exit_status != 0:
            raise Exception('Training command failed with exit status: {}'.format(exit_status))
        print('Training complete.')

        # A zero exit code causes the job to be marked a Succeeded.
        sys.exit(0)
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)