#!/bin/bash
# Change "T5_configz_and_code" to the folder that contains your config files and main training code.
WORKING_DIR=/opt/ml/code/T5_configz_and_code
SM_WORKING_DIR=/opt/ml/model

# Cluster information for the multi-node setup.
MASTER_HOST=$SM_MASTER
MASTER_ADDR=$SM_MASTER_ADDR
MASTER_PORT="23456"
NNODES="$NODE_NUMBER"
NODE_RANK="$NODE_INDEX"

# Distributed arguments for torch.distributed.launch.
GPUS_PER_NODE="$SM_NUM_GPUS"
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
                  --nnodes $NNODES \
                  --node_rank $NODE_RANK \
                  --master_addr $MASTER_ADDR \
                  --master_port $MASTER_PORT"

SAVE_PATH="${SM_WORKING_DIR}/results"
LOG_FILE="${SAVE_PATH}/log.txt"

# Path to your DeepSpeed config file.
DS_CONFIG="${WORKING_DIR}/configs/ds_flan_t5_z3_config_bf16.json"

# Training parameters; adjust them to your model and dataset.
# Note: train_dataset_path and test_dataset_path must match the names of your input data channels.
EPOCHS=1
model_id="google/flan-t5-xxl"
train_dataset_path='/opt/ml/input/data/training'
test_dataset_path='/opt/ml/input/data/test'
learning_rate=0.0001
generation_max_length=150
per_device_train_batch_size=1
per_device_eval_batch_size=8

# Arguments passed to the training script.
OPTS=""
OPTS+=" --per_device_eval_batch_size ${per_device_eval_batch_size}"
OPTS+=" --per_device_train_batch_size ${per_device_train_batch_size}"
OPTS+=" --generation_max_length ${generation_max_length}"
OPTS+=" --test_dataset_path ${test_dataset_path}"
OPTS+=" --model_id ${model_id}"
OPTS+=" --train_dataset_path ${train_dataset_path}"
OPTS+=" --distributed-backend nccl"
OPTS+=" --learning_rate ${learning_rate}"
OPTS+=" --deepspeed"
OPTS+=" --deepspeed_config ${DS_CONFIG}"
OPTS+=" --epochs ${EPOCHS}"

# Build the launch command, create the output directory, and run training,
# teeing stdout/stderr to a log file under SAVE_PATH.
CMD="python -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${WORKING_DIR}/scripts/run_seq2seq_deepspeed.py ${OPTS}"
echo "${CMD}"
mkdir -p "${SAVE_PATH}"
${CMD} 2>&1 | tee "${SAVE_PATH}/train_log"
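
#------------------------------------------------------------------------------
# For reference only: the contents of ds_flan_t5_z3_config_bf16.json are not
# reproduced in this script. The block below is a hypothetical minimal sketch
# of a DeepSpeed ZeRO stage-3 / bf16 config, kept in a no-op heredoc so it is
# never executed or written anywhere. ZeRO stage 3 partitions optimizer states,
# gradients, and parameters across GPUs, which is what lets an 11B-parameter
# model like flan-t5-xxl fit in memory. The "auto" values are only resolved
# when the config is consumed through the Hugging Face Trainer DeepSpeed
# integration; replace them with concrete numbers otherwise. Your actual config
# file may differ.
#------------------------------------------------------------------------------
: <<'DS_CONFIG_EXAMPLE'
{
  "bf16": { "enabled": true },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto"
}
DS_CONFIG_EXAMPLE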