# A unique identifier for the cluster.
cluster_name: cluster

# The maximum number of worker nodes to launch in addition to the head node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
upscaling_speed: 1.0

# Run Ray nodes in Docker using the Ray ML image with GPU support.
docker:
    image: "rayproject/ray-ml:latest-gpu"
    container_name: "ray_nvidia_docker"

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider-specific configuration.
provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    # Terminate nodes on scale-down instead of stopping them for reuse.
    cache_stopped_nodes: False
    # CloudWatch agent and dashboard configuration shipped to cluster nodes.
    cloudwatch:
        agent:
            config: "cloudwatch/cloudwatch-agent-config.json"
        dashboard:
            name: "RayDashboard"
            config: "cloudwatch/cloudwatch-dashboard-config.json"

# How Ray authenticates with newly launched nodes.
auth:
    ssh_user: ubuntu

available_node_types:
    # CPU head node, which also hosts the MLflow and JupyterLab services.
    ray.head.default:
        node_config:
            InstanceType: r5dn.4xlarge
            ImageId: ami-0a2363a9cff180a64  # us-west-2 DL AMI Ubuntu Version 30
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 500
    # GPU worker nodes.
    ray.worker.default:
        docker:
            worker_image: "rayproject/ray-ml:latest-gpu"
        min_workers: 2
        max_workers: 2
        # Leave empty so Ray autodetects CPU and GPU resources on each node.
        resources: {}
        node_config:
            InstanceType: p2.xlarge
            ImageId: ami-0a2363a9cff180a64  # us-west-2 DL AMI Ubuntu Version 30
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 500

head_node_type: ray.head.default

# Commands run on every node.
setup_commands:
    - pip install -U torch
    - conda install -y cudatoolkit

# Commands run on the head node only: install and launch MLflow and JupyterLab.
head_setup_commands:
    - pip install -U jupyterlab mlflow boto3
    - nohup mlflow ui --host 0.0.0.0 --port 5001 > mlflow.out &
    - nohup jupyter lab > jupyterlab.out &

worker_setup_commands: []

# Commands to start Ray on the head node.
head_start_ray_commands:
    - ray stop
    - export AUTOSCALER_MAX_NUM_FAILURES=inf; ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Commands to start Ray on worker nodes.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
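
# Usage sketch (assumptions: the file name cluster.yaml is hypothetical, AWS
# credentials are configured locally, and the cloudwatch/*.json files exist
# relative to the working directory):
#   ray up cluster.yaml       # launch the cluster
#   ray attach cluster.yaml   # open a shell on the head node
#   ray down cluster.yaml     # tear the cluster down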