---
kind: Service
apiVersion: v1
metadata:
  name: ${instance_name}
  namespace: ${namespace}
  labels:
    app: ${instance_name}
spec:
  ports:
    - name: model-server
      port: ${service_port}
      targetPort: pod-port  # Matches the named container port in the Deployment below.
  selector:
    app: ${instance_name}
    role: master
  type: ClusterIP
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: ${instance_name}
  namespace: ${namespace}
  labels:
    app: ${instance_name}
    role: master
spec:
  replicas: 1  # Desired number of replicas; increase as needed.
  selector:
    matchLabels:
      app: ${instance_name}
      role: master
  template:
    metadata:
      labels:
        app: ${instance_name}
        role: master
    spec:
      nodeSelector:
        # Schedule pods onto nodes of the selected Inferentia instance type.
        node.kubernetes.io/instance-type: "${instance_type}"
      containers:
        - name: main
          image: "${registry}${model_image_name}${model_image_tag}"
          env:
            - name: NUM_MODELS
              value: "${num_models}"
            - name: POSTPROCESS
              value: "${postprocess}"
            - name: QUIET
              value: "${quiet}"
          imagePullPolicy: Always
          ports:
            - name: pod-port
              containerPort: 8080
          securityContext:
            capabilities:
              add:
                - IPC_LOCK  # Allow the container to lock memory, as required by the Neuron runtime.
          resources:
            limits:
              #hugepages-2Mi: 256Mi  # Set to 256Mi * desired number of Inferentia devices.
              aws.amazon.com/neuron: 1  # Desired number of Inferentia devices.
            #requests:
              #memory: 1024Mi  # Desired amount of memory; should be larger than the hugepages-2Mi limit.
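
# Usage note (a minimal sketch, not necessarily the workflow that ships with this template):
# the ${...} placeholders above are expected to be substituted before the manifest is applied.
# One common approach is envsubst piped into kubectl; the variable values and filename below
# are illustrative assumptions only. Note that the image field concatenates registry, name,
# and tag directly, so include any separators (e.g. "/" and ":") in the values you export.
#
#   export instance_name=my-model namespace=default service_port=8080 \
#          instance_type=inf1.xlarge registry=<your-registry>/ \
#          model_image_name=<your-image> model_image_tag=:latest \
#          num_models=1 postprocess=True quiet=False
#   envsubst < deployment.yml | kubectl apply -f -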