---
# CloudFormation template for running Amazon DSSTNE on EC2 Container Service.
#
# What the stack does, as defined below:
#   * launches one ECS container instance in an Auto Scaling group,
#   * on first boot, clones the amazon-dsstne repository, builds a Docker image
#     from the Dockerfile written by cfn-init, and pushes it to the stack's ECR
#     repository,
#   * registers an ECS task definition that downloads the data and training
#     config, runs generateNetCDF / train / predict, and copies the results to
#     the stack's S3 bucket.
AWSTemplateFormatVersion: '2010-09-09'
Description: Creates environment for running Amazon DSSTNE on EC2 Container Service

Metadata:
  # Controls how parameters are grouped and labelled in the console.
  AWS::CloudFormation::Interface:
    ParameterGroups:
      - Label:
          default: SSH Access
        Parameters:
          - KeyName
          - SSHLocation
      - Label:
          default: 'VPC, Subnets, and Instance'
        Parameters:
          - VpcId
          - Subnets
          - ECSInstanceType
      - Label:
          default: Data and Config
        Parameters:
          - DataUrl
          - TrainingConfigUrl
    ParameterLabels:
      DataUrl:
        default: Data URL
      ECSInstanceType:
        default: ECS instance type
      KeyName:
        default: Key name
      SSHLocation:
        default: SSH CIDR range
      Subnets:
        default: Subnets
      TrainingConfigUrl:
        default: Training config URL
      VpcId:
        default: VPC
  Maintainer:
    Description: Chad Schmutzer

Parameters:
  KeyName:
    Type: AWS::EC2::KeyPair::KeyName
    Description: Name of an existing EC2 KeyPair
  SSHLocation:
    Type: String
    Description: Restrict SSH access to CIDR range (default can be accessed from anywhere)
    Default: '0.0.0.0/0'
    MinLength: '9'
    MaxLength: '18'
    AllowedPattern: '(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})/(\d{1,2})'
    ConstraintDescription: Must be a valid CIDR range of the form x.x.x.x/x
  VpcId:
    Type: AWS::EC2::VPC::Id
    Description: VPC ID of your existing Virtual Private Cloud (VPC)
    ConstraintDescription: Must be the VPC ID of an existing Virtual Private Cloud
  Subnets:
    # Must be a typed list of subnet IDs: the value is passed straight to the
    # Auto Scaling group's VPCZoneIdentifier. A bare `List` is not a valid
    # CloudFormation parameter type.
    Type: 'List<AWS::EC2::Subnet::Id>'
    Description: Please make sure you select 1 or more subnets in the Virtual Private Cloud (VPC) ID you chose above
    ConstraintDescription: Must be a list of existing subnets in the selected Virtual Private Cloud
  ECSInstanceType:
    Type: String
    Description: ECS instance type
    Default: g2.2xlarge
    AllowedValues:
      - g2.2xlarge
      - g2.8xlarge
      - p2.xlarge
      - p2.8xlarge
      - p2.16xlarge
    ConstraintDescription: Must be g2.2xlarge, g2.8xlarge, p2.xlarge, p2.8xlarge, p2.16xlarge
  DataUrl:
    Type: String
    Description: Neural Network Modeling data URL
    Default: 'https://s3-us-west-2.amazonaws.com/amazon-dsstne-samples/data/ml20m-all'
    ConstraintDescription: Must be valid URL
  TrainingConfigUrl:
    Type: String
    Description: Training config URL
    Default: 'https://s3-us-west-2.amazonaws.com/amazon-dsstne-samples/configs/config.json'
    ConstraintDescription: Must be a valid URL

Mappings:
  # Region -> AMI ID lookup used for the container instance's ImageId.
  # The stack can only be launched in the regions listed here.
  AmazonLinuxAMI:
    ap-northeast-1: {AMI: ami-78ba6619}
    ap-southeast-1: {AMI: ami-56e84c35}
    ap-southeast-2: {AMI: ami-2589b946}
    eu-central-1: {AMI: ami-d9d62ab6}
    eu-west-1: {AMI: ami-b9bd25ca}
    us-east-1: {AMI: ami-50b4f047}
    us-west-1: {AMI: ami-699ad409}
    us-west-2: {AMI: ami-002bf460}

Resources:
  # Log group targeted by the task definition's awslogs driver.
  CloudWatchLogsGroup:
    Type: AWS::Logs::LogGroup
    Properties:
      RetentionInDays: 7

  # Repository the instance pushes the built DSSTNE image to.
  ECRRepository:
    Type: AWS::ECR::Repository

  ECSCluster:
    Type: AWS::ECS::Cluster

  # Bucket the task copies its `results` file into.
  S3BucketResults:
    Type: AWS::S3::Bucket
    Properties:
      AccessControl: BucketOwnerFullControl

  ECSInstanceAutoScalingGroup:
    Type: AWS::AutoScaling::AutoScalingGroup
    # Stack creation waits (up to 120 minutes) for the cfn-signal sent at the
    # end of the instance's UserData script.
    CreationPolicy:
      ResourceSignal:
        Count: '1'
        Timeout: PT120M
    Properties:
      DesiredCapacity: '1'
      MinSize: '1'
      MaxSize: '1'
      LaunchConfigurationName: {Ref: ECSInstanceLaunchConfiguration}
      VPCZoneIdentifier: {Ref: Subnets}
      Tags:
        - Key: Name
          Value: {Ref: 'AWS::StackName'}
          PropagateAtLaunch: 'true'

  ECSInstanceLaunchConfiguration:
    Type: AWS::AutoScaling::LaunchConfiguration
    Metadata:
      # Applied by the cfn-init call in UserData.
      AWS::CloudFormation::Init:
        config:
          packages:
            yum:
              docker: []
              ecs-init: []
              git: []
          services:
            sysvinit:
              docker:
                enabled: 'true'
                ensureRunning: 'true'
          files:
            # ECS agent configuration: join this stack's cluster and allow the
            # awslogs driver that the task definition uses.
            /etc/ecs/ecs.config:
              owner: root
              group: root
              mode: '000644'
              content:
                Fn::Join:
                  - ''
                  - - 'ECS_CLUSTER='
                    - {Ref: ECSCluster}
                    - "\n"
                    - 'ECS_AVAILABLE_LOGGING_DRIVERS=["json-file","awslogs"]'
                    - "\n"
            # Default AWS CLI region for ec2-user.
            /home/ec2-user/.aws/config:
              owner: ec2-user
              group: root
              mode: '000644'
              content:
                Fn::Join:
                  - ''
                  - - "[default]\n"
                    - 'region = '
                    - {Ref: 'AWS::Region'}
                    - "\n"
            # Default AWS CLI region for root (UserData runs `aws ecr get-login`).
            /root/.aws/config:
              owner: root
              group: root
              mode: '000644'
              content:
                Fn::Join:
                  - ''
                  - - "[default]\n"
                    - 'region = '
                    - {Ref: 'AWS::Region'}
                    - "\n"
            # Dockerfile copied into the amazon-dsstne checkout by UserData.
            # It contains no template references, so it is a plain literal
            # block; `${PATH}` etc. are Dockerfile variables, not CloudFormation
            # substitutions.
            /home/ec2-user/Dockerfile:
              owner: root
              group: root
              mode: '000644'
              content: |
                FROM nvidia/cuda:7.5-cudnn5-devel

                RUN apt-get update && \
                    apt-get -y dist-upgrade && \
                    apt-get -y install \
                    kmod \
                    make \
                    build-essential \
                    cmake \
                    cpp \
                    g++ \
                    gcc \
                    libatlas-base-dev \
                    curl \
                    python-pip \
                    openmpi-bin \
                    libopenmpi-dev \
                    libjsoncpp-dev \
                    libhdf5-dev \
                    openssh-client \
                    zlib1g-dev

                RUN pip install awscli

                ENV GPU_DRIVER_VERSION=352.99

                RUN cd /tmp && \
                    curl -LO http://us.download.nvidia.com/XFree86/Linux-x86_64/$GPU_DRIVER_VERSION/NVIDIA-Linux-x86_64-$GPU_DRIVER_VERSION.run && \
                    chmod +x ./NVIDIA-Linux-x86_64-$GPU_DRIVER_VERSION.run && \
                    ./NVIDIA-Linux-x86_64-$GPU_DRIVER_VERSION.run -s --no-kernel-module && \
                    rm -rf /tmp/*

                RUN ln -s /usr/include/hdf5/serial/* /usr/include/

                RUN ln -s /usr/lib/x86_64-linux-gnu/hdf5/serial/libhdf5* /usr/lib/x86_64-linux-gnu/

                RUN cd /tmp && \
                    curl -LO ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4.1.3.tar.gz && \
                    tar xvf netcdf-4.1.3.tar.gz && \
                    cd netcdf-4.1.3 && \
                    ./configure --prefix=/usr/local && \
                    make -j16 && \
                    make install && rm -rf /tmp/*

                RUN cd /tmp && \
                    curl -LO http://www.unidata.ucar.edu/downloads/netcdf/ftp/netcdf-cxx4-4.2.tar.gz && \
                    tar xvf netcdf-cxx4-4.2.tar.gz && \
                    cd netcdf-cxx4-4.2 && \
                    ./configure --prefix=/usr/local && \
                    make -j16 && \
                    make install && rm -rf /tmp/*

                RUN cd /tmp && \
                    curl -LO https://github.com/NVlabs/cub/archive/1.5.2.zip && \
                    apt-get install -y unzip && \
                    unzip 1.5.2.zip && \
                    cp -rf cub-1.5.2/cub/ /usr/local/include/ && \
                    rm -rf /tmp/*

                ENV PATH=/usr/local/openmpi/bin/:/usr/local/cuda/bin/:${PATH} \
                    LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH}

                RUN ln -sf /usr/lib/openmpi /usr/local/openmpi

                COPY src /opt/amazon/dsstne/src

                RUN sed -i "s/-gencode arch=compute_60,code=sm_60//" /opt/amazon/dsstne/src/amazon/dsstne/Makefile.inc

                RUN cd /opt/amazon/dsstne/src/amazon/dsstne && \
                    make

                ENV PATH /opt/amazon/dsstne/src/amazon/dsstne/bin:${PATH}
    Properties:
      ImageId:
        Fn::FindInMap:
          - AmazonLinuxAMI
          - {Ref: 'AWS::Region'}
          - AMI
      InstanceType: {Ref: ECSInstanceType}
      KeyName: {Ref: KeyName}
      IamInstanceProfile: {Ref: ECSInstanceProfile}
      SecurityGroups:
        - {Ref: ECSInstanceSecurityGroup}
      BlockDeviceMappings:
        - DeviceName: /dev/xvda
          Ebs:
            VolumeSize: '30'
      # Boot script: run cfn-init, start the ECS agent, build and push the
      # DSSTNE image, then signal the Auto Scaling group.
      # NOTE(review): the script runs under `bash -xe`, so `$?` passed to
      # cfn-signal is the status of the preceding `docker push`, and any
      # earlier failure aborts the script before a signal is sent (the stack
      # then waits for the CreationPolicy timeout) -- confirm this is intended.
      UserData:
        Fn::Base64:
          Fn::Join:
            - ''
            - - "#!/bin/bash -xe\n"
              - "yum -y update --exclude=kernel* --exclude=nvidia*\n"
              - "# Install the files and packages from the metadata\n"
              - '/opt/aws/bin/cfn-init -v '
              - ' --stack '
              - {Ref: 'AWS::StackName'}
              - ' --resource ECSInstanceLaunchConfiguration '
              - ' --region '
              - {Ref: 'AWS::Region'}
              - "\n"
              - "/sbin/start ecs\n"
              - "cd /root/ && git clone https://github.com/amznlabs/amazon-dsstne.git\n"
              - "cd /root/amazon-dsstne/\n"
              - "cp /home/ec2-user/Dockerfile .\n"
              - 'docker build -t '
              - {Ref: ECRRepository}
              - " .\n"
              - "aws ecr get-login | sh\n"
              - 'docker tag '
              - {Ref: ECRRepository}
              - ':latest '
              - {Ref: 'AWS::AccountId'}
              - '.dkr.ecr.'
              - {Ref: 'AWS::Region'}
              - '.amazonaws.com/'
              - {Ref: ECRRepository}
              - ":latest\n"
              - 'docker push '
              - {Ref: 'AWS::AccountId'}
              - '.dkr.ecr.'
              - {Ref: 'AWS::Region'}
              - '.amazonaws.com/'
              - {Ref: ECRRepository}
              - ":latest\n"
              - "# Signal the status from cfn-init\n"
              - '/opt/aws/bin/cfn-signal -e $? '
              - ' --stack '
              - {Ref: 'AWS::StackName'}
              - ' --resource ECSInstanceAutoScalingGroup '
              - ' --region '
              - {Ref: 'AWS::Region'}
              - "\n"

  ECSInstanceProfile:
    Type: AWS::IAM::InstanceProfile
    DependsOn: ECSInstanceRole
    Properties:
      Path: /
      Roles:
        - {Ref: ECSInstanceRole}

  # Role assumed by the container instance: the managed ECS instance policy
  # plus access to this stack's results bucket and ECR repository.
  ECSInstanceRole:
    Type: AWS::IAM::Role
    Properties:
      Path: /
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - ec2.amazonaws.com
            Action:
              - 'sts:AssumeRole'
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role'
      Policies:
        - PolicyName: Amazon-DSSTNE
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              # Bucket-level permission (ARN without a key suffix).
              - Effect: Allow
                Action: 's3:ListBucket'
                Resource:
                  Fn::Join:
                    - ''
                    - - 'arn:aws:s3:::'
                      - {Ref: S3BucketResults}
              # Object-level permissions (ARN with /*).
              - Effect: Allow
                Action:
                  - 's3:PutObject'
                  - 's3:GetObject'
                  - 's3:DeleteObject'
                Resource:
                  Fn::Join:
                    - ''
                    - - 'arn:aws:s3:::'
                      - {Ref: S3BucketResults}
                      - '/*'
              # Push access, scoped to this stack's repository.
              - Effect: Allow
                Action:
                  - 'ecr:DescribeRepositories'
                  - 'ecr:ListImages'
                  - 'ecr:InitiateLayerUpload'
                  - 'ecr:UploadLayerPart'
                  - 'ecr:CompleteLayerUpload'
                  - 'ecr:PutImage'
                Resource:
                  Fn::Join:
                    - ''
                    - - 'arn:aws:ecr:'
                      - {Ref: 'AWS::Region'}
                      - ':'
                      - {Ref: 'AWS::AccountId'}
                      - ':repository/'
                      - {Ref: ECRRepository}

  # Only inbound rule: SSH (tcp/22) from the SSHLocation CIDR.
  ECSInstanceSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: Security Group for ECSInstance
      VpcId: {Ref: VpcId}
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: '22'
          ToPort: '22'
          CidrIp: {Ref: SSHLocation}
      Tags:
        - Key: Name
          Value: ECSInstanceSecurityGroup

  ECSServiceRole:
    Type: AWS::IAM::Role
    Properties:
      Path: /
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - ecs.amazonaws.com
            Action:
              - 'sts:AssumeRole'
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceRole'

  ECSTaskDefinitionDsstne:
    Type: AWS::ECS::TaskDefinition
    Properties:
      ContainerDefinitions:
        - Name: Amazon-DSSTNE
          Image:
            Fn::Join:
              - ''
              - - {Ref: 'AWS::AccountId'}
                - '.dkr.ecr.'
                - {Ref: 'AWS::Region'}
                - '.amazonaws.com/'
                - {Ref: ECRRepository}
                - ':latest'
          Memory: '15000'
          Privileged: 'true'
          EntryPoint:
            - /bin/bash
            - -c
          # One shell command line (folded scalar: line breaks become single
          # spaces). The *CMD variables come from Environment below, so the
          # individual pipeline steps can be overridden per task run.
          Command:
            - >-
              curl -Lo data $DATAURL &&
              echo "running $GENERATEINPUTCMD" && $GENERATEINPUTCMD &&
              echo "running $GENERATEOUTPUTCMD" && $GENERATEOUTPUTCMD &&
              curl -Lo config $TRAININGCONFIGURL &&
              echo "running $TRAINCMD" && $TRAINCMD &&
              echo "running $PREDICTCMD" && $PREDICTCMD &&
              DATE=`date -Iseconds` &&
              echo "results being written to s3://$S3BUCKETRESULTS/results.$HOSTNAME.$DATE" &&
              aws s3 cp results s3://$S3BUCKETRESULTS/results.$HOSTNAME.$DATE &&
              echo "Task complete!"
          Environment:
            - Name: DATAURL
              Value: {Ref: DataUrl}
            - Name: TRAININGCONFIGURL
              Value: {Ref: TrainingConfigUrl}
            - Name: S3BUCKETRESULTS
              Value: {Ref: S3BucketResults}
            - Name: GENERATEINPUTCMD
              Value: 'generateNetCDF -d gl_input -i data -o gl_input.nc -f features_input -s samples_input -c'
            - Name: GENERATEOUTPUTCMD
              Value: 'generateNetCDF -d gl_output -i data -o gl_output.nc -f features_output -s samples_input -c'
            - Name: TRAINCMD
              Value: 'train -c config -i gl_input.nc -o gl_output.nc -n gl.nc -b 256 -e 10'
            - Name: PREDICTCMD
              Value: 'predict -b 1024 -d gl -i features_input -o features_output -k 10 -n gl.nc -f data -s results -r data'
          LogConfiguration:
            LogDriver: awslogs
            Options:
              awslogs-group: {Ref: CloudWatchLogsGroup}
              awslogs-region: {Ref: 'AWS::Region'}

Outputs:
  CloudWatchLogsGroup:
    Description: Name of the CloudWatch Logs Group
    Value: {Ref: CloudWatchLogsGroup}
  DsstneTaskDefinition:
    Description: The task definition for Amazon DSSTNE
    Value: {Ref: ECSTaskDefinitionDsstne}
  ECRRepository:
    Description: The ECR Repository for the Amazon DSSTNE container
    Value: {Ref: ECRRepository}
  ECSClusterName:
    Description: Name of the ECS cluster
    Value: {Ref: ECSCluster}
  S3BucketResults:
    Description: The S3 bucket for storing results
    Value: {Ref: S3BucketResults}