# A unique identifier for the cluster.
cluster_name: cluster

# The maximum number of worker nodes to launch in addition to the head node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
upscaling_speed: 1.0

# Run Ray nodes in Docker using the Ray ML image with GPU support.
docker:
    image: "rayproject/ray-ml:latest-gpu"
    container_name: "ray_nvidia_docker"

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider-specific configuration.
provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    # Terminate nodes on scale-down instead of stopping them for reuse.
    cache_stopped_nodes: False
    # CloudWatch agent and dashboard configuration shipped to cluster nodes.
    cloudwatch:
        agent:
            config: "cloudwatch/cloudwatch-agent-config.json"
        dashboard:
            name: "RayDashboard"
            config: "cloudwatch/cloudwatch-dashboard-config.json"

# How Ray authenticates with newly launched nodes.
auth:
    ssh_user: ubuntu

available_node_types:
    # CPU head node, which also hosts the MLflow and JupyterLab services.
    ray.head.default:
        node_config:
            InstanceType: r5dn.4xlarge
            ImageId: ami-0a2363a9cff180a64  # us-west-2 DL AMI Ubuntu Version 30
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 500
    # GPU worker nodes.
    ray.worker.default:
        docker:
            worker_image: "rayproject/ray-ml:latest-gpu"
        min_workers: 2
        max_workers: 2
        # Leave empty so Ray autodetects CPU and GPU resources on each node.
        resources: {}
        node_config:
            InstanceType: p2.xlarge
            ImageId: ami-0a2363a9cff180a64  # us-west-2 DL AMI Ubuntu Version 30
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 500

head_node_type: ray.head.default

# Commands run on every node.
setup_commands:
    - pip install -U torch
    - conda install -y cudatoolkit

# Commands run on the head node only: install and launch MLflow and JupyterLab.
head_setup_commands:
    - pip install -U jupyterlab mlflow boto3
    - nohup mlflow ui --host 0.0.0.0 --port 5001 > mlflow.out &
    - nohup jupyter lab > jupyterlab.out &

worker_setup_commands: []

# Commands to start Ray on the head node.
head_start_ray_commands:
    - ray stop
    - export AUTOSCALER_MAX_NUM_FAILURES=inf; ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Commands to start Ray on worker nodes.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
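
# Usage sketch (assumptions: the file name cluster.yaml is hypothetical, AWS
# credentials are configured locally, and the cloudwatch/*.json files exist
# relative to the working directory):
#   ray up cluster.yaml       # launch the cluster
#   ray attach cluster.yaml   # open a shell on the head node
#   ray down cluster.yaml     # tear the cluster down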