import argparse
import json
import logging
import os
import subprocess
import sys
import traceback


def _parse_args(argv=None):
    """Parse the command-line options this entry point understands.

    Defaults for most options are read from SageMaker-style environment
    variables (``SM_MODEL_DIR``, ``SM_CHANNEL_TRAINING``, ``SM_HOSTS``,
    ``SM_CURRENT_HOST``).

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``sys.argv[1:]`` is used, as argparse normally does.

    Returns:
        A ``(namespace, unknown)`` tuple as produced by
        ``ArgumentParser.parse_known_args``: the parsed options and the list
        of arguments that were not recognised.
    """
    parser = argparse.ArgumentParser()

    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))
    # SM_HOSTS holds a JSON list. Fall back to an empty list when the variable
    # is missing or empty (json.loads(None) would raise TypeError), and parse a
    # command-line value as JSON too: ``type=list`` would split the string
    # into individual characters.
    parser.add_argument('--hosts', type=json.loads,
                        default=json.loads(os.environ.get('SM_HOSTS') or '[]'))
    parser.add_argument('--current-host', type=str, default=os.environ.get('SM_CURRENT_HOST'))

    return parser.parse_known_args(argv)


def _write_failure_file(message):
    """Best-effort write of *message* to ``/opt/ml/output/failure``.

    An ``OSError`` while writing is reported on stderr instead of being
    raised, so that it cannot hide the training error being reported.
    """
    try:
        with open(os.path.join('/opt/ml/output', 'failure'), 'w') as failure_file:
            failure_file.write(message)
    except OSError as write_error:
        print('Could not write failure file: ' + str(write_error), file=sys.stderr)


if __name__ == "__main__":
    args, unknown = _parse_args()

    module_path = os.path.abspath('audio_event_detection/scripts/training')
    # NOTE: this only changes sys.path of the current process; train.py is
    # started below as a separate process.
    if module_path not in sys.path:
        sys.path.append(module_path)

    try:
        # check=True makes a non-zero exit status of train.py raise
        # CalledProcessError; without it a failed training run would fall
        # through and be reported as successful.
        # sys.executable runs train.py with the interpreter running this script.
        subprocess.run(
            [sys.executable or "python", os.path.join(module_path, "train.py")],
            check=True,
        )
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        failure_message = 'Exception during training: ' + str(e) + '\n' + trc
        _write_failure_file(failure_message)
        # Printing this causes the exception to be in the training job logs, as well.
        print(failure_message, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)

    print('Training complete.')
    # A zero exit code causes the job to be marked as Succeeded.
    sys.exit(0)