version: 0.2

env:
  variables:
    COACH_MXNET_TOOLKIT_VERSION: '0.11.0'
    COACH_MXNET_FRAMEWORK_VERSION: '1.3.0'
    COACH_TF_TOOLKIT_VERSION: '0.11.1'
    COACH_TF_FRAMEWORK_VERSION: '1.12.0'
    RAY_TF_TOOLKIT_VERSION: '0.6.5'
    RAY_TF_FRAMEWORK_VERSION: '1.12.0'
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
    PY_VERSION: '3'
    ECR_REPO: 'sagemaker-test'
    GITHUB_REPO: 'sagemaker-rl-container'
    PROD_ACCOUNT: '520713654638'
    SETUP_FILE: 'setup_cmds.sh'
    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .'

phases:
  pre_build:
    commands:
      - start-dockerd
      - |
        ACCOUNT=$(aws sts get-caller-identity --query 'Account' --output text)
        PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
        PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # keep ssh connection alive when communicating with remote ec2 server during integ test
      # largest connection idle time allowed: 10 seconds * 300 attempts = 50 minutes
      - |
        echo ' ServerAliveInterval 10' >> ~/.ssh/config
        echo ' ServerAliveCountMax 300' >> ~/.ssh/config

  build:
    commands:
      # install
      - echo "install"
      - pip3 install -U -e .

      # Update awscli for compatibility with the latest botocore version that breaks it
      # https://github.com/boto/boto3/issues/2596
      - pip3 install --upgrade awscli

      # launch remote gpu instance
      - echo "launch remote gpu instance"
      - |
        prefix='ml.'
        instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
      - create-key-pair
      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu

      # log in to the prod registry that hosts the base images
      - $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION --registry-ids $PROD_ACCOUNT)
      - |
        MXNET_IMAGE="$PROD_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/sagemaker-mxnet"
        TF_IMAGE="$PROD_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/sagemaker-tensorflow-scriptmode"
        BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"

      # pull cpu base images
      - echo "pull cpu base images"
      - |
        COACH_MXNET_CPU_BASE_TAG="$COACH_MXNET_FRAMEWORK_VERSION-cpu-py$PY_VERSION"
        docker pull $MXNET_IMAGE:$COACH_MXNET_CPU_BASE_TAG
        COACH_TF_CPU_BASE_TAG="$COACH_TF_FRAMEWORK_VERSION-cpu-py$PY_VERSION"
        docker pull $TF_IMAGE:$COACH_TF_CPU_BASE_TAG
        if [ "$RAY_TF_FRAMEWORK_VERSION" != "$COACH_TF_FRAMEWORK_VERSION" ]; then
          RAY_TF_CPU_BASE_TAG="$RAY_TF_FRAMEWORK_VERSION-cpu-py$PY_VERSION"
          docker pull $TF_IMAGE:$RAY_TF_CPU_BASE_TAG
        fi

      # pull gpu base images
      - echo "pull gpu base images"
      - |
        COACH_MXNET_GPU_BASE_TAG="$COACH_MXNET_FRAMEWORK_VERSION-gpu-py$PY_VERSION"
        docker pull $MXNET_IMAGE:$COACH_MXNET_GPU_BASE_TAG
        COACH_TF_GPU_BASE_TAG="$COACH_TF_FRAMEWORK_VERSION-gpu-py$PY_VERSION"
        docker pull $TF_IMAGE:$COACH_TF_GPU_BASE_TAG
        if [ "$RAY_TF_FRAMEWORK_VERSION" != "$COACH_TF_FRAMEWORK_VERSION" ]; then
          RAY_TF_GPU_BASE_TAG="$RAY_TF_FRAMEWORK_VERSION-gpu-py$PY_VERSION"
          docker pull $TF_IMAGE:$RAY_TF_GPU_BASE_TAG
        fi

      # build cpu images
      - echo "build cpu images"
      - |
        COACH_MXNET_CPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-cpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=cpu .
        COACH_TF_CPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-cpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$COACH_TF_CPU_TAG -f coach/docker/$COACH_TF_TOOLKIT_VERSION/Dockerfile.tf --build-arg processor=cpu .
        RAY_TF_CPU_TAG="ray-$RAY_TF_TOOLKIT_VERSION-tf-cpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$RAY_TF_CPU_TAG -f ray/docker/$RAY_TF_TOOLKIT_VERSION/Dockerfile.tf --build-arg processor=cpu .
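      # image tag layout used above and in the gpu builds below (illustrative):
      #   <toolkit>-<toolkit_version>-<framework>-<processor>-py<py_version>-<build_id>
      #   e.g. coach-0.11.0-mxnet-cpu-py3-$BUILD_ID, where BUILD_ID is
      #   $CODEBUILD_BUILD_ID with ':' replaced by '-'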
      # push cpu images to ecr
      - echo "push cpu images to ecr"
      - |
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker push $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG
        docker push $PREPROD_IMAGE:$COACH_TF_CPU_TAG
        docker push $PREPROD_IMAGE:$RAY_TF_CPU_TAG

      # run cpu integration tests
      - echo "run cpu integration tests"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*" "buildspec.yml"; then
          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --processor cpu
          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_CPU_TAG --framework tensorflow --toolkit coach --processor cpu
          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_CPU_TAG --framework tensorflow --toolkit ray --processor cpu
        else
          echo "skipping cpu integration tests"
        fi

      # build gpu images
      - echo "build gpu images"
      - |
        COACH_MXNET_GPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-gpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=gpu .
        COACH_TF_GPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$COACH_TF_GPU_TAG -f coach/docker/$COACH_TF_TOOLKIT_VERSION/Dockerfile.tf --build-arg processor=gpu .
        RAY_TF_GPU_TAG="ray-$RAY_TF_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$RAY_TF_GPU_TAG -f ray/docker/$RAY_TF_TOOLKIT_VERSION/Dockerfile.tf --build-arg processor=gpu .

      # push gpu images to ecr
      - echo "push gpu images to ecr"
      - |
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker push $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG
        docker push $PREPROD_IMAGE:$COACH_TF_GPU_TAG
        docker push $PREPROD_IMAGE:$RAY_TF_GPU_TAG

      # run gpu integration tests
      - echo "run gpu integration tests"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*" "buildspec.yml"; then
          printf "$SETUP_CMDS" > $SETUP_FILE
          cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit coach --framework mxnet --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_GPU_TAG --processor gpu"
          remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number $PR_NUM
          cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit coach --framework tensorflow --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_GPU_TAG --processor gpu"
          remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --pr-number $PR_NUM --skip-setup
          cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit ray --framework tensorflow --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_GPU_TAG --processor gpu"
          remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --pr-number $PR_NUM --skip-setup
        else
          echo "skipping gpu integration tests"
        fi
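      # note on the gpu runs above: the first remote-test call bootstraps the remote
      # instance from $SETUP_FILE (printf expands the '\n' escapes in $SETUP_CMDS into
      # real newlines); the remaining calls pass --skip-setup to reuse that environment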
      # run cpu sagemaker tests
      - echo "run cpu sagemaker tests"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec.yml"; then
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --instance-type $CPU_INSTANCE_TYPE
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_CPU_TAG --framework tensorflow --toolkit coach --instance-type $CPU_INSTANCE_TYPE
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_CPU_TAG --framework tensorflow --toolkit ray --instance-type $CPU_INSTANCE_TYPE
        else
          echo "skipping cpu sagemaker tests"
        fi

      # run gpu sagemaker tests
      - echo "run gpu sagemaker tests"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec.yml"; then
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_GPU_TAG --framework mxnet --toolkit coach --instance-type $GPU_INSTANCE_TYPE
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_GPU_TAG --framework tensorflow --toolkit coach --instance-type $GPU_INSTANCE_TYPE
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_GPU_TAG --framework tensorflow --toolkit ray --instance-type $GPU_INSTANCE_TYPE
        else
          echo "skipping gpu sagemaker tests"
        fi

    finally:
      # shut down remote gpu instance
      - cleanup-gpu-instances
      - cleanup-key-pairs

      # remove ecr images
      - |
        aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$COACH_MXNET_CPU_TAG
        aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$COACH_TF_CPU_TAG
        aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$RAY_TF_CPU_TAG
        aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$COACH_MXNET_GPU_TAG
        aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$COACH_TF_GPU_TAG
        aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$RAY_TF_GPU_TAG
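      # cleanup notes: the finally block runs even if a build command fails, so the EC2
      # instance, key pairs, and test images are always removed; batch-delete-image
      # deletes only this build's tags, not the ECR repository itself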