compute_environment: LOCAL_MACHINE deepspeed_config: gradient_accumulation_steps: 4 gradient_clipping: 1.0 offload_optimizer_device: none offload_param_device: none zero3_init_flag: true zero3_save_16bit_model: true zero_stage: 3 distributed_type: DEEPSPEED downcast_bf16: 'yes' dynamo_config: {} fsdp_config: {} machine_rank: 0 main_training_function: main megatron_lm_config: {} num_machines: 1 num_processes: 4 rdzv_backend: static same_network: true tpu_env: [] tpu_use_cluster: false tpu_use_sudo: false use_cpu: false