# CodeBuild buildspec for the Coach (TensorFlow) RL container images.
#
# Flow: pull the TensorFlow base images, build the CPU and GPU Coach images,
# push them to the pre-production ECR repo, run local and SageMaker
# integration tests, and (for release builds) re-tag and push to the
# production repo. The pre-production images are deleted in `finally`.
#
# Helper commands such as start-dockerd, has-matching-changes,
# is-release-build, create-key-pair, launch-ec2-instance, remote-test,
# cleanup-gpu-instances and cleanup-key-pairs are not defined in this file;
# they are presumably provided by the build environment - TODO confirm.
version: 0.2

env:
  variables:
    COACH_TF_TOOLKIT_VERSION: '1.0.0'
    COACH_TF_FRAMEWORK_VERSION: '1.12.0'
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
    PY_VERSION: '3'
    # previous images repo for layer cache, same name as prod image repo
    BASE_ECR_REPO: 'sagemaker-rl-coach-container'
    PREPROD_ECR_REPO: 'sagemaker-test'
    PROD_ECR_REPO: 'sagemaker-rl-coach-container'
    GITHUB_REPO: 'sagemaker-rl-container'
    # base image account/repo info for faster builds
    # NOTE(review): not referenced by any command visible in this file.
    BASE_IMAGE_ACCOUNT: '462105765813'
    # base image account (tf/mxnet images) required for building rl container images
    FRAMEWORK_BASE_IMAGE_ACCOUNT: '520713654638'
    SETUP_FILE: 'setup_cmds.sh'
    # Single-quoted on purpose: the literal "\n" sequences are expanded later
    # by printf when the setup file is written.
    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .'

phases:
  pre_build:
    commands:
      - start-dockerd

      # Resolve the current account and derive the ECR image URIs.
      # These shell variables are reused by the build and finally commands.
      - |
        ACCOUNT=$(aws sts get-caller-identity --query 'Account' --output text)
        BASE_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$BASE_ECR_REPO"
        PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$PREPROD_ECR_REPO"
        PROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$PROD_ECR_REPO"
        # PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')

      # keep ssh connection alive when communicating with remote ec2 server during integ test
      # largest connection idle time allowed: 10 seconds * 300 attempts = 50 minutes
      - |
        echo ' ServerAliveInterval 10' >> ~/.ssh/config
        echo ' ServerAliveCountMax 300' >> ~/.ssh/config

  build:
    commands:
      # install
      - echo "install"
      - pip3 install -U -e .

      # Update awscli for compatibility with the latest botocore version that breaks it
      # https://github.com/boto/boto3/issues/2596
      - pip3 install --upgrade awscli

      # launch remote gpu instance only in region us-west-2
      # The "ml." prefix is stripped from GPU_INSTANCE_TYPE to get the EC2 type.
      - |
        if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
          echo "launch remote gpu instance"
          prefix='ml.'
          instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
          create-key-pair
          launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu
        else
          echo "skipping launch remote gpu instance"
        fi

      # log in to the registry that hosts the TensorFlow base images
      - $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION --registry-ids $FRAMEWORK_BASE_IMAGE_ACCOUNT)
      # BUILD_ID is CODEBUILD_BUILD_ID with ':' replaced by '-' so it can be
      # embedded in an image tag.
      - |
        TF_IMAGE="$FRAMEWORK_BASE_IMAGE_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/sagemaker-tensorflow-scriptmode"
        BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"

      # pull cpu base images
      - echo "pull cpu base images"
      - |
        COACH_TF_CPU_BASE_TAG="$COACH_TF_FRAMEWORK_VERSION-cpu-py$PY_VERSION"
        docker pull $TF_IMAGE:$COACH_TF_CPU_BASE_TAG

      # pull gpu base images
      - echo "pull gpu base images"
      - |
        COACH_TF_GPU_BASE_TAG="$COACH_TF_FRAMEWORK_VERSION-gpu-py$PY_VERSION"
        docker pull $TF_IMAGE:$COACH_TF_GPU_BASE_TAG

      # build coach preprod cpu images
      # The previously published image is pulled only to seed the layer cache;
      # the result of this command is the exit status of `docker build`.
      - echo "build coach preprod cpu images"
      - |
        COACH_TF_CPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-cpu-py$PY_VERSION"
        COACH_TF_CPU_TAG_BUILD_ID="coach-$COACH_TF_TOOLKIT_VERSION-tf-cpu-py$PY_VERSION-$BUILD_ID"
        echo "pulling previous_image $BASE_IMAGE:$COACH_TF_CPU_TAG for layer cache..."
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker pull $BASE_IMAGE:$COACH_TF_CPU_TAG
        docker build --cache-from $BASE_IMAGE:$COACH_TF_CPU_TAG \
                     -t $PREPROD_IMAGE:$COACH_TF_CPU_TAG_BUILD_ID \
                     -f coach/docker/$COACH_TF_TOOLKIT_VERSION/Dockerfile.tf \
                     --build-arg processor=cpu \
                     --build-arg region=$AWS_DEFAULT_REGION .

      # push coach preprod cpu images to ecr
      - echo "push coach preprod cpu images to ecr"
      - |
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker push $PREPROD_IMAGE:$COACH_TF_CPU_TAG_BUILD_ID

      # run cpu integration tests for coach preprod cpu images
      - echo "run local cpu integration tests for coach preprod cpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "buildspec-coach.yml"; then
          pytest test/integration/local \
            -k "test_coach" \
            --region $AWS_DEFAULT_REGION \
            --docker-base-name $PREPROD_IMAGE \
            --tag $COACH_TF_CPU_TAG_BUILD_ID \
            --framework tensorflow \
            --toolkit coach \
            --processor cpu
        else
          echo "skipping local cpu integration tests"
        fi

      # build coach preprod gpu images
      - echo "build coach preprod gpu images"
      - |
        COACH_TF_GPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION"
        COACH_TF_GPU_TAG_BUILD_ID="coach-$COACH_TF_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION-$BUILD_ID"
        echo "pulling previous_image $BASE_IMAGE:$COACH_TF_GPU_TAG for layer cache..."
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker pull $BASE_IMAGE:$COACH_TF_GPU_TAG
        docker build --cache-from $BASE_IMAGE:$COACH_TF_GPU_TAG \
                     -t $PREPROD_IMAGE:$COACH_TF_GPU_TAG_BUILD_ID \
                     -f coach/docker/$COACH_TF_TOOLKIT_VERSION/Dockerfile.tf \
                     --build-arg processor=gpu \
                     --build-arg region=$AWS_DEFAULT_REGION .

      # push coach preprod gpu images to ecr
      - echo "push coach preprod gpu images to ecr"
      - |
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker push $PREPROD_IMAGE:$COACH_TF_GPU_TAG_BUILD_ID

      # run gpu integration tests for coach preprod gpu images only in us-west-2
      # The setup script is written with printf so the "\n" sequences in
      # SETUP_CMDS become real newlines.
      - echo "run local gpu integration tests for coach preprod gpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "buildspec-coach.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            printf "$SETUP_CMDS" > $SETUP_FILE
            cmd="pytest test/integration/local -k 'test_coach' --region $AWS_DEFAULT_REGION --toolkit coach --framework tensorflow --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_GPU_TAG_BUILD_ID --processor gpu"
            remote-test --github-repo $GITHUB_REPO --branch master --test-cmd "$cmd" --setup-file $SETUP_FILE
          fi
        else
          echo "skipping local gpu integration tests"
        fi

      # run cpu sagemaker tests
      # "coach/*" is watched in addition to "docker/*" so this gate agrees with
      # the local-test gates and covers the Dockerfile under coach/docker/.
      - echo "run cpu sagemaker tests for coach preprod cpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "docker/*" "buildspec-coach.yml"; then
          pytest test/integration/sagemaker \
            -k "test_coach" \
            --region $AWS_DEFAULT_REGION \
            --docker-base-name $PREPROD_ECR_REPO \
            --aws-id $ACCOUNT \
            --tag $COACH_TF_CPU_TAG_BUILD_ID \
            --framework tensorflow \
            --toolkit coach \
            --instance-type $CPU_INSTANCE_TYPE
        else
          echo "skipping cpu sagemaker tests"
        fi

      # run gpu sagemaker tests (only in us-west-2)
      # "coach/*" is watched in addition to "docker/*" so this gate agrees with
      # the local-test gates and covers the Dockerfile under coach/docker/.
      - echo "run gpu sagemaker tests"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "docker/*" "buildspec-coach.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            pytest test/integration/sagemaker \
              -k "test_coach" \
              --region $AWS_DEFAULT_REGION \
              --docker-base-name $PREPROD_ECR_REPO \
              --aws-id $ACCOUNT \
              --tag $COACH_TF_GPU_TAG_BUILD_ID \
              --framework tensorflow \
              --toolkit coach \
              --instance-type $GPU_INSTANCE_TYPE
          fi
        else
          echo "skipping gpu sagemaker tests"
        fi

      # publish cpu and gpu image to prod ecr repo if this is release build
      # The steps are chained with && so that a failed tag or push fails this
      # command; without the chain only the final push decides the exit status
      # and an earlier failure would be masked.
      - |
        if is-release-build; then
          $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) &&
            docker tag $PREPROD_IMAGE:$COACH_TF_CPU_TAG_BUILD_ID $PROD_IMAGE:$COACH_TF_CPU_TAG &&
            docker push $PROD_IMAGE:$COACH_TF_CPU_TAG &&
            docker tag $PREPROD_IMAGE:$COACH_TF_GPU_TAG_BUILD_ID $PROD_IMAGE:$COACH_TF_GPU_TAG &&
            docker push $PROD_IMAGE:$COACH_TF_GPU_TAG
        else
          echo "skipping publishing new image to production repo"
        fi

    finally:
      # only shut down remote gpu instance if in us-west-2
      - |
        if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
          echo "cleanup remote gpu instance"
          cleanup-gpu-instances
          cleanup-key-pairs
        else
          echo "No remote gpu instance to cleanup"
        fi

      # remove ecr image
      # A tag variable is empty when the build failed before the matching
      # image was tagged; skip it rather than call ECR with an empty imageTag.
      - |
        for tag in "$COACH_TF_CPU_TAG_BUILD_ID" "$COACH_TF_GPU_TAG_BUILD_ID"; do
          if [ -n "$tag" ]; then
            aws ecr batch-delete-image --repository-name $PREPROD_ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$tag
          fi
        done