provider "aws" { region = local.region } provider "kubernetes" { host = module.eks.cluster_endpoint cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) exec { api_version = "client.authentication.k8s.io/v1beta1" command = "aws" # This requires the awscli to be installed locally where Terraform is executed args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] } } provider "helm" { kubernetes { host = module.eks.cluster_endpoint cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) exec { api_version = "client.authentication.k8s.io/v1beta1" command = "aws" # This requires the awscli to be installed locally where Terraform is executed args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] } } } provider "kubectl" { apply_retry_count = 5 host = module.eks.cluster_endpoint cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) load_config_file = false exec { api_version = "client.authentication.k8s.io/v1beta1" command = "aws" # This requires the awscli to be installed locally where Terraform is executed args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] } } data "aws_availability_zones" "available" {} locals { name = basename(path.cwd) region = "us-west-2" vpc_cidr = "10.0.0.0/16" azs = slice(data.aws_availability_zones.available.names, 0, 3) tags = { Blueprint = local.name GithubRepo = "github.com/aws-ia/terraform-aws-eks-blueprints" } } ################################################################################ # Cluster ################################################################################ #tfsec:ignore:aws-eks-enable-control-plane-logging module "eks" { source = "terraform-aws-modules/eks/aws" version = "~> 19.13" cluster_name = local.name cluster_version = "1.27" cluster_endpoint_public_access = true cluster_addons = { coredns = {} kube-proxy = {} vpc-cni = {} } vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets # Extend node-to-node security group rules node_security_group_additional_rules = { ingress_self_all = { description = "Node to node all ingress traffic" protocol = "-1" from_port = 0 to_port = 0 type = "ingress" self = true } egress_self_all = { description = "Node to node all egress traffic" protocol = "-1" from_port = 0 to_port = 0 type = "egress" self = true } } eks_managed_node_groups = { # For running services that do not require GPUs default = { instance_types = ["m5.large"] min_size = 1 max_size = 5 desired_size = 2 } efa = { ami_type = "AL2_x86_64_GPU" instance_types = ["g5.8xlarge"] min_size = 1 max_size = 3 desired_size = 1 subnet_ids = slice(module.vpc.private_subnets, 0, 1) network_interfaces = [ { description = "EFA interface" delete_on_termination = true device_index = 0 associate_public_ip_address = false interface_type = "efa" } ] placement = { group_name = aws_placement_group.efa.name } pre_bootstrap_user_data = <<-EOT # Install EFA curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer ./efa_installer.sh -y --minimal fi_info -p efa -t FI_EP_RDM # Disable ptrace sysctl -w kernel.yama.ptrace_scope=0 EOT taints = { dedicated = { key = "nvidia.com/gpu" value = "true" effect = "NO_SCHEDULE" } } } } tags = local.tags } ################################################################################ # EKS Blueprints Addons ################################################################################ module 
"eks_blueprints_addons" { source = "aws-ia/eks-blueprints-addons/aws" version = "~> 1.0" cluster_name = module.eks.cluster_name cluster_endpoint = module.eks.cluster_endpoint cluster_version = module.eks.cluster_version oidc_provider_arn = module.eks.oidc_provider_arn # We want to wait for the Fargate profiles to be deployed first create_delay_dependencies = [for group in module.eks.eks_managed_node_groups : group.node_group_arn] enable_aws_efs_csi_driver = true enable_aws_fsx_csi_driver = true enable_kube_prometheus_stack = true kube_prometheus_stack = { values = [ <<-EOT prometheus: prometheusSpec: serviceMonitorSelectorNilUsesHelmValues: false EOT ] } enable_metrics_server = true helm_releases = { prometheus-adapter = { description = "A Helm chart for k8s prometheus adapter" namespace = "prometheus-adapter" create_namespace = true chart = "prometheus-adapter" chart_version = "4.2.0" repository = "https://prometheus-community.github.io/helm-charts" values = [ <<-EOT replicas: 2 podDisruptionBudget: enabled: true EOT ] } gpu-operator = { description = "A Helm chart for NVIDIA GPU operator" namespace = "gpu-operator" create_namespace = true chart = "gpu-operator" chart_version = "v23.3.2" repository = "https://nvidia.github.io/gpu-operator" values = [ <<-EOT operator: defaultRuntime: containerd EOT ] } } tags = local.tags } ################################################################################ # Amazon Elastic Fabric Adapter (EFA) ################################################################################ data "http" "efa_device_plugin_yaml" { url = "https://raw.githubusercontent.com/aws-samples/aws-efa-eks/main/manifest/efa-k8s-device-plugin.yml" } resource "kubectl_manifest" "efa_device_plugin" { yaml_body = <<-YAML ${data.http.efa_device_plugin_yaml.response_body} YAML } ################################################################################ # Supporting Resources ################################################################################ module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" name = local.name cidr = local.vpc_cidr azs = local.azs private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)] public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)] enable_nat_gateway = true single_nat_gateway = true public_subnet_tags = { "kubernetes.io/role/elb" = 1 } private_subnet_tags = { "kubernetes.io/role/internal-elb" = 1 } tags = local.tags } # Group instances within clustered placement group so they are in close proximity resource "aws_placement_group" "efa" { name = local.name strategy = "cluster" }