# ***************************************************************************************
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.                    *
#                                                                                       *
# Permission is hereby granted, free of charge, to any person obtaining a copy of this  *
# software and associated documentation files (the "Software"), to deal in the Software *
# without restriction, including without limitation the rights to use, copy, modify,    *
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to    *
# permit persons to whom the Software is furnished to do so.                            *
#                                                                                       *
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,   *
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A         *
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT    *
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION     *
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE        *
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                                *
# ***************************************************************************************
"""Entry point that prepares the node environment and starts the torch launcher.

The script derives this node's index and the master node's address from the
SageMaker-provided environment variables, exports them for the launcher shell
script, and then runs that script. The launcher's exit status becomes this
process's exit status so that a failed launch is not reported as a success.
"""

import json
import os
import socket
import subprocess
import sys

# Directory that holds the launcher script and the s5cmd binary.
SCRIPTS_DIR = "./T5_configz_and_code/scripts"


def cluster_env(environ, resolve=socket.gethostbyname):
    """Return the environment variables the launcher script expects.

    Args:
        environ: Mapping that provides ``SM_HOSTS`` (JSON list of host names),
            ``SM_CURRENT_HOST`` and ``SM_TRAINING_ENV`` (JSON object with a
            ``master_hostname`` entry).
        resolve: Callable mapping a host name to an address string. Defaults
            to ``socket.gethostbyname``; injectable so the function can be
            exercised without DNS.

    Returns:
        A dict with ``NODE_INDEX``, ``SM_MASTER``, ``SM_MASTER_ADDR`` and
        ``NCCL_SOCKET_IFNAME``; every value is a string.

    Raises:
        KeyError: A required variable or the ``master_hostname`` entry is missing.
        ValueError: The current host is not listed in ``SM_HOSTS``.
    """
    hosts = json.loads(environ["SM_HOSTS"])
    # The node index is the position of this host in the cluster host list.
    node_index = hosts.index(environ["SM_CURRENT_HOST"])
    # Parse the master node's host name so its IP address can be resolved.
    master = json.loads(environ["SM_TRAINING_ENV"])["master_hostname"]
    return {
        "NODE_INDEX": str(node_index),
        "SM_MASTER": str(master),
        "SM_MASTER_ADDR": str(resolve(master)),
        "NCCL_SOCKET_IFNAME": "eth0",
    }


def exit_status(returncode):
    """Convert a ``subprocess`` return code into a process exit status.

    ``subprocess`` reports death by signal N as ``-N``; that is mapped to the
    shell convention ``128 + N``. Non-negative codes pass through unchanged.
    """
    if returncode < 0:
        return 128 - returncode
    return returncode


def main():
    """Export the cluster variables, run the launcher, return its exit status."""
    os.environ.update(cluster_env(os.environ))

    # Make the helper files executable. This stays best-effort, as in the
    # original script, but a failure is now reported instead of ignored.
    # Note (from the original): s5cmd is used to speed up uploading model
    # assets to S3.
    for name in ("torch_launch.sh", "s5cmd"):
        path = os.path.join(SCRIPTS_DIR, name)
        if subprocess.call(["chmod", "+x", path]) != 0:
            sys.stderr.write("warning: could not make %s executable\n" % path)

    # Invoke the torch launcher shell script. Per the original note, the
    # PyTorch launcher is used to launch DeepSpeed for multi-node training.
    launcher = os.path.join(SCRIPTS_DIR, "torch_launch.sh")
    return exit_status(subprocess.call(["/bin/bash", "-c", launcher]))


if __name__ == "__main__":
    # Propagate the launcher's status; previously it was silently discarded.
    sys.exit(main())