import os import re import subprocess import botocore import boto3 import json import time from packaging.version import Version from packaging.specifiers import SpecifierSet import pytest import requests from urllib3.util.retry import Retry from invoke.context import Context from botocore.exceptions import ClientError from src.buildspec import Buildspec from test.test_utils import ( LOGGER, CONTAINER_TESTS_PREFIX, ec2, get_container_name, get_framework_and_version_from_tag, get_neuron_sdk_version_from_tag, get_neuron_release_manifest, is_canary_context, is_dlc_cicd_context, run_cmd_on_container, start_container, stop_and_remove_container, get_repository_local_path, get_repository_and_tag_from_image_uri, get_python_version_from_image_uri, get_cuda_version_from_tag, get_labels_from_ecr_image, construct_buildspec_path, is_tf_version, is_nightly_context, get_processor_from_image_uri, execute_env_variables_test, UL20_CPU_ARM64_US_WEST_2, UBUNTU_18_HPU_DLAMI_US_WEST_2, NEURON_UBUNTU_18_BASE_DLAMI_US_WEST_2, ) @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("N/A") @pytest.mark.canary("Run stray file test regularly on production images") def test_stray_files(image): """ Test to ensure that unnecessary build artifacts are not present in any easily visible or tmp directories :param image: ECR image URI """ ctx = Context() container_name = get_container_name("test_tmp_dirs", image) start_container(container_name, image, ctx) # Running list of artifacts/artifact regular expressions we do not want in any of the directories stray_artifacts = [r"\.py"] # Running list of allowed files in the /tmp directory allowed_tmp_files = ["hsperfdata_root"] # Ensure stray artifacts are not in the tmp directory tmp = run_cmd_on_container(container_name, ctx, "ls -A /tmp") _assert_artifact_free(tmp, stray_artifacts) # Ensure tmp dir is empty except for whitelisted files tmp_files = tmp.stdout.split() for tmp_file in tmp_files: assert ( tmp_file in allowed_tmp_files ), f"Found unexpected file in tmp dir: {tmp_file}. Allowed tmp files: {allowed_tmp_files}" # We always expect /var/tmp to be empty var_tmp = run_cmd_on_container(container_name, ctx, "ls -A /var/tmp") _assert_artifact_free(var_tmp, stray_artifacts) assert var_tmp.stdout.strip() == "" # Additional check of home and root directories to ensure that stray artifacts are not present home = run_cmd_on_container(container_name, ctx, "ls -A ~") _assert_artifact_free(home, stray_artifacts) root = run_cmd_on_container(container_name, ctx, "ls -A /") _assert_artifact_free(root, stray_artifacts) @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("N/A") @pytest.mark.canary("Run python version test regularly on production images") def test_python_version(image): """ Check that the python version in the image tag is the same as the one on a running container. :param image: ECR image URI """ ctx = Context() container_name = get_container_name("py-version", image) py_version = "" for tag_split in image.split("-"): if tag_split.startswith("py"): if len(tag_split) > 3: py_version = f"Python {tag_split[2]}.{tag_split[3]}" else: py_version = f"Python {tag_split[2]}" start_container(container_name, image, ctx) output = run_cmd_on_container(container_name, ctx, "python --version") # Due to py2 deprecation, Python2 version gets streamed to stderr. Python installed via Conda also appears to # stream to stderr (in some cases). container_py_version = output.stdout + output.stderr assert py_version in container_py_version, f"Cannot find {py_version} in {container_py_version}" @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("N/A") def test_ubuntu_version(image): """ Check that the ubuntu version in the image tag is the same as the one on a running container. :param image: ECR image URI """ ctx = Context() container_name = get_container_name("ubuntu-version", image) ubuntu_version = "" for tag_split in image.split("-"): if tag_split.startswith("ubuntu"): ubuntu_version = tag_split.split("ubuntu")[-1] start_container(container_name, image, ctx) output = run_cmd_on_container(container_name, ctx, "cat /etc/os-release") container_ubuntu_version = output.stdout assert "Ubuntu" in container_ubuntu_version assert ubuntu_version in container_ubuntu_version @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("N/A") @pytest.mark.canary("Run non-gpu tf serving version test regularly on production images") def test_tf_serving_version_cpu(tensorflow_inference): """ For non-huggingface non-GPU TF inference images, check that the tag version matches the version of TF serving in the container. Huggingface includes MMS and core TF, hence the versioning scheme is based off of the underlying tensorflow framework version, rather than the TF serving version. GPU inference images will be tested along side `test_framework_and_cuda_version_gpu` in order to be judicious about GPU resources. This test can run directly on the host, and thus does not require additional resources to be spun up. @param tensorflow_inference: ECR image URI """ # Set local variable to clarify contents of fixture image = tensorflow_inference if "gpu" in image: pytest.skip( "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu" ) if "neuron" in image: pytest.skip( "Neuron images will have their framework version tested in test_framework_and_neuron_sdk_version" ) _, tag_framework_version = get_framework_and_version_from_tag(image) image_repo_name, _ = get_repository_and_tag_from_image_uri(image) if re.fullmatch(r"(pr-|beta-|nightly-)?tensorflow-inference", image_repo_name) and Version( tag_framework_version ) == Version("2.6.3"): pytest.skip( "Skipping this test for TF 2.6.3 inference as the v2.6.3 version is already on production" ) ctx = Context() container_name = get_container_name("tf-serving-version", image) start_container(container_name, image, ctx) output = run_cmd_on_container( container_name, ctx, "tensorflow_model_server --version", executable="bash" ) assert re.match( rf"TensorFlow ModelServer: {tag_framework_version}(\D+)?", output.stdout ), f"Cannot find model server version {tag_framework_version} in {output.stdout}" stop_and_remove_container(container_name, ctx) @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("N/A") def test_tf_serving_api_version(tensorflow_inference): """ For non-huggingface TF inference images, check that the tag version matches the version of TF serving api in the container. Huggingface includes MMS and core TF, hence the versioning scheme is based off of the underlying tensorflow framework version, rather than the TF serving version. @param tensorflow_inference: ECR image URI """ # Set local variable to clarify contents of fixture image = tensorflow_inference if "gpu" in image: cmd = "pip show tensorflow-serving-api-gpu | grep Version" elif "cpu" in image: cmd = "pip show tensorflow-serving-api | grep Version" else: ValueError( "Test as of now only covers CPU and GPU type images. If required, please modify this test to accommodate the new image type!" ) _, tag_framework_version = get_framework_and_version_from_tag(image) ctx = Context() container_name = get_container_name("tf-serving-api-version", image) start_container(container_name, image, ctx) try: output = run_cmd_on_container(container_name, ctx, cmd, executable="bash") str_version_from_output = ((str(output.stdout).split(" "))[1]).strip() assert ( tag_framework_version == str_version_from_output ), f"Tensorflow serving API version is {str_version_from_output} while the Tensorflow version is {tag_framework_version}. Both don't match!" except Exception as e: LOGGER.error(f"Unable to execute command on container. Error: {e}") raise finally: stop_and_remove_container(container_name, ctx) @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.model("N/A") def test_sm_toolkit_and_ts_version_pytorch(pytorch_inference, region): _test_sm_toolkit_and_ts_version(pytorch_inference, region) @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.model("N/A") def test_sm_toolkit_and_ts_version_pytorch_graviton(pytorch_inference_graviton, region): _test_sm_toolkit_and_ts_version(pytorch_inference_graviton, region) @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.model("N/A") def test_sm_toolkit_and_ts_version_pytorch_neuron(pytorch_inference_neuron, region): _test_sm_toolkit_and_ts_version(pytorch_inference_neuron, region) @pytest.mark.usefixtures("sagemaker", "huggingface") @pytest.mark.model("N/A") @pytest.mark.canary("Run non-gpu framework version test regularly on production images") def test_framework_version_cpu(image): """ Check that the framework version in the image tag is the same as the one on a running container. This function tests CPU, EIA images. :param image: ECR image URI """ if "gpu" in image: pytest.skip( "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu" ) if "neuron" in image: pytest.skip( "Neuron images will have their framework version tested in test_framework_and_neuron_sdk_version" ) image_repo_name, _ = get_repository_and_tag_from_image_uri(image) if re.fullmatch(r"(pr-|beta-|nightly-)?tensorflow-inference(-eia|-graviton)?", image_repo_name): pytest.skip( "Non-gpu tensorflow-inference images will be tested in test_tf_serving_version_cpu." ) tested_framework, tag_framework_version = get_framework_and_version_from_tag(image) # Framework name may include huggingface if any([tested_framework.startswith(prefix) for prefix in ["huggingface_", "stabilityai_"]]): # Remove the prefix till first underscore tested_framework = "_".join(tested_framework.split("_")[1:]) # Module name is torch if tested_framework == "pytorch": tested_framework = "torch" elif tested_framework == "autogluon": tested_framework = "autogluon.core" ctx = Context() container_name = get_container_name("framework-version", image) start_container(container_name, image, ctx) output = run_cmd_on_container( container_name, ctx, f"import {tested_framework}; print({tested_framework}.__version__)", executable="python", ).stdout.strip() if is_canary_context(): assert tag_framework_version in output else: if tested_framework == "autogluon.core": versions_map = { # container version -> autogluon version # '0.3.2': '0.3.1', } version_to_check = versions_map.get(tag_framework_version, tag_framework_version) assert output.startswith(version_to_check) # Habana v1.2 binary does not follow the X.Y.Z+cpu naming convention elif "habana" not in image_repo_name: if tested_framework == "torch" and Version(tag_framework_version) >= Version("1.10.0"): if is_nightly_context(): torch_version_pattern = r"{torch_version}(\+cpu|\.dev\d+)".format( torch_version=tag_framework_version ) assert re.fullmatch(torch_version_pattern, output), ( f"torch.__version__ = {output} does not match {torch_version_pattern}\n" f"Please specify nightly framework version as X.Y.Z.devYYYYMMDD" ) else: if ( Version(tag_framework_version) >= Version("2.0.0") and "training" in image_repo_name ): cuda_output = run_cmd_on_container( container_name, ctx, f"import {tested_framework}; print({tested_framework}.version.cuda)", executable="python", ).stdout.strip() torch_version_pattern = r"{torch_version}".format( torch_version=tag_framework_version ) assert cuda_output == "None", f"cuda version has value: {cuda_output}" else: torch_version_pattern = r"{torch_version}(\+cpu)".format( torch_version=tag_framework_version ) assert re.fullmatch(torch_version_pattern, output), ( f"torch.__version__ = {output} does not match {torch_version_pattern}\n" f"Please specify framework version as X.Y.Z+cpu" ) else: if "neuron" in image: assert tag_framework_version in output if all(_string in image for _string in ["pytorch", "habana"]) and any( _string in image for _string in ["synapseai1.3.0", "synapseai1.4.1", "synapseai1.5.0"] ): # Habana Pytorch version looks like 1.10.0a0+gitb488e78 for SynapseAI1.3 PT1.10.1 images pt_fw_version_pattern = r"(\d+(\.\d+){1,2}(-rc\d)?)((a0\+git\w{7}))" pt_fw_version_match = re.fullmatch(pt_fw_version_pattern, output) # This is desired for PT1.10.1 images assert ( tag_framework_version.rsplit(".", 1)[0] == pt_fw_version_match.group(1).rsplit(".", 1)[0] ) else: assert tag_framework_version == output stop_and_remove_container(container_name, ctx) @pytest.mark.usefixtures("sagemaker", "huggingface") @pytest.mark.model("N/A") def test_framework_and_neuron_sdk_version(neuron): """ Gets the neuron sdk tag from the image. For that neuron sdk and the frame work version from the image, it gets the expected frame work version. Then checks that the expected framework version same as the one on a running container. This function test only Neuron images. :param image: ECR image URI """ image = neuron tested_framework, tag_framework_version = get_framework_and_version_from_tag(image) neuron_sdk_version = get_neuron_sdk_version_from_tag(image) assert neuron_sdk_version is not None, "missing Neuron SDK version" release_manifest = get_neuron_release_manifest(neuron_sdk_version) # Framework name may include huggingface if tested_framework.startswith("huggingface_"): tested_framework = tested_framework[len("huggingface_") :] package_name = None if tested_framework == "pytorch": if "training" in image or "neuronx" in image: tested_framework = "torch_neuronx" package_name = "torch-neuronx" else: tested_framework = "torch_neuron" package_name = "torch-neuron" elif tested_framework == "tensorflow": if "neuronx" in image: tested_framework = "tensorflow_neuronx" package_name = "tensorflow-neuronx" else: tested_framework = "tensorflow_neuron" package_name = "tensorflow-neuron" elif tested_framework == "mxnet": tested_framework = "mxnet" package_name = "mxnet_neuron" ctx = Context() assert ( package_name in release_manifest ), f"release_manifest does not contain package {package_name}:\n {json.dumps(release_manifest)}" container_name = get_container_name("framework-version-neuron", image) start_container(container_name, image, ctx) output = run_cmd_on_container( container_name, ctx, f"import {tested_framework}; print({tested_framework}.__version__)", executable="python", ) installed_framework_version = output.stdout.strip() assert installed_framework_version in release_manifest[package_name], ( f"framework {tested_framework} version {installed_framework_version} " f"not found in released versions for that package: {release_manifest[package_name]}" ) stop_and_remove_container(container_name, ctx) @pytest.mark.usefixtures("sagemaker", "huggingface") @pytest.mark.model("N/A") @pytest.mark.parametrize("ec2_instance_type", ["p3.2xlarge"], indirect=True) def test_framework_and_cuda_version_gpu(gpu, ec2_connection): """ Check that the framework and cuda version in the image tag is the same as the one on a running container. :param gpu: ECR image URI with "gpu" in the name :param ec2_connection: fixture to establish connection with an ec2 instance """ image = gpu tested_framework, tag_framework_version = get_framework_and_version_from_tag(image) image_repo_name, _ = get_repository_and_tag_from_image_uri(image) if re.fullmatch(r"(pr-|beta-|nightly-)?tensorflow-inference", image_repo_name) and Version( tag_framework_version ) == Version("2.6.3"): pytest.skip( "Skipping this test for TF 2.6.3 inference as the v2.6.3 version is already on production" ) # Framework Version Check # # For tf inference containers, check TF model server version if re.fullmatch(r"(pr-|beta-|nightly-)?tensorflow-inference(-eia|-graviton)?", image_repo_name): cmd = f"tensorflow_model_server --version" output = ec2.execute_ec2_training_test(ec2_connection, image, cmd, executable="bash").stdout assert re.match( rf"TensorFlow ModelServer: {tag_framework_version}(\D+)?", output ), f"Cannot find model server version {tag_framework_version} in {output}" else: # Framework name may include huggingface if any( [tested_framework.startswith(prefix) for prefix in ["huggingface_", "stabilityai_"]] ): tested_framework = "_".join(tested_framework.split("_")[1:]) # Replace the trcomp string as it is extracted from ECR repo name tested_framework = tested_framework.replace("_trcomp", "") # Framework name may include trcomp if "trcomp" in tested_framework: # Replace the trcomp string as it is extracted from ECR repo name tested_framework = tested_framework.replace("_trcomp", "") # Module name is "torch" if tested_framework == "pytorch": tested_framework = "torch" elif tested_framework == "autogluon": tested_framework = "autogluon.core" cmd = f"import {tested_framework}; print({tested_framework}.__version__)" output = ec2.execute_ec2_training_test( ec2_connection, image, cmd, executable="python" ).stdout.strip() if is_canary_context(): assert tag_framework_version in output else: if tested_framework == "autogluon.core": # If tag and framework are not matching: # version_to_check = "0.3.1" if tag_framework_version == "0.3.2" else tag_framework_version # assert output.stdout.strip().startswith(version_to_check) pass elif tested_framework == "torch" and Version(tag_framework_version) >= Version( "1.10.0" ): if is_nightly_context(): torch_version_pattern = r"{torch_version}(\+cu\d+|\.dev\d+)".format( torch_version=tag_framework_version ) assert re.fullmatch(torch_version_pattern, output), ( f"torch.__version__ = {output} does not match {torch_version_pattern}\n" f"Please specify nightly framework version as X.Y.Z.devYYYYMMDD" ) else: if ( Version(tag_framework_version) >= Version("2.0.0") and "training" in image_repo_name ): cuda_output = ec2.execute_ec2_training_test( ec2_connection, image, 'import torch; print(torch.version.cuda.replace(".", ""));', executable="python", container_name="PT2", ).stdout.strip() cuda_ver = get_cuda_version_from_tag(image) torch_version_pattern = r"{torch_version}".format( torch_version=tag_framework_version ) assert ( output == tag_framework_version ), f"torch.__version__ = {output} does not match {torch_version_pattern}\n" assert ( cuda_ver == "cu" + cuda_output ), f"torch.version.cuda {cuda_ver} doesn't match {cuda_output}" else: torch_version_pattern = r"{torch_version}(\+cu\d+)".format( torch_version=tag_framework_version ) assert re.fullmatch(torch_version_pattern, output), ( f"torch.__version__ = {output} does not match {torch_version_pattern}\n" f"Please specify framework version as X.Y.Z+cuXXX" ) else: assert tag_framework_version == output # CUDA Version Check # cuda_version = re.search(r"-cu(\d+)-", image).group(1) # MXNet inference/HF tensorflow inference and Autogluon containers do not currently have nvcc in /usr/local/cuda/bin, so check symlink if ( "mxnet-inference" in image or "autogluon" in image or "huggingface-tensorflow-inference" in image ): cuda_cmd = "readlink -f /usr/local/cuda" else: cuda_cmd = "nvcc --version" cuda_output = ec2.execute_ec2_training_test( ec2_connection, image, cuda_cmd, container_name="cuda_version_test" ) # Ensure that cuda version in tag is in the container assert cuda_version in cuda_output.stdout.replace(".", "") @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("N/A") def test_dataclasses_check(image): """ Ensure there is no dataclasses pip package is installed for python 3.7 and above version. Python version retrieved from the ecr image uri is expected in the format `py` :param image: ECR image URI """ ctx = Context() pip_package = "dataclasses" container_name = get_container_name("dataclasses-check", image) python_version = get_python_version_from_image_uri(image).replace("py", "") python_version = int(python_version) if python_version >= 37: start_container(container_name, image, ctx) output = run_cmd_on_container(container_name, ctx, f"pip show {pip_package}", warn=True) if output.return_code == 0: pytest.fail( f"{pip_package} package exists in the DLC image {image} that has py{python_version} version which is greater than py36 version" ) else: LOGGER.info(f"{pip_package} package does not exists in the DLC image {image}") else: pytest.skip( f"Skipping test for DLC image {image} that has py36 version as {pip_package} is not included in the python framework" ) @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("N/A") def test_pip_check(image): """ Ensure there are no broken requirements on the containers by running "pip check" :param image: ECR image URI """ ctx = Context() gpu_suffix = "-gpu" if "gpu" in image else "" allowed_exception_list = [] # SageMaker Python SDK updated its pyyaml requirement to 6.0, which is incompatible with the # requirement from awscli. awscli only requires pyyaml for ecs/eks related invocations, while # pyyaml usage seems to be more fundamental in sagemaker. Therefore, we are ignoring awscli's # requirement in favor of sagemaker. allowed_awscli_exception = re.compile( r"^awscli \d+(\.\d+)* has requirement PyYAML<5\.5,>=3\.10, but you have pyyaml 6\.0.$" ) allowed_exception_list.append(allowed_awscli_exception) # TF inference containers do not have core tensorflow installed by design. Allowing for this pip check error # to occur in order to catch other pip check issues that may be associated with TF inference # smclarify binaries have s3fs->aiobotocore dependency which uses older version of botocore. temporarily # allowing this to catch other issues allowed_tf_exception = re.compile( rf"^tensorflow-serving-api{gpu_suffix} \d\.\d+\.\d+ requires tensorflow(|{gpu_suffix}), which is not installed.$" ) allowed_exception_list.append(allowed_tf_exception) allowed_smclarify_exception = re.compile( r"^aiobotocore \d+(\.\d+)* has requirement botocore<\d+(\.\d+)*,>=\d+(\.\d+)*, " r"but you have botocore \d+(\.\d+)*\.$" ) allowed_exception_list.append(allowed_smclarify_exception) # The v0.22 version of tensorflow-io has a bug fixed in v0.23 https://github.com/tensorflow/io/releases/tag/v0.23.0 allowed_habana_tf_exception = re.compile( rf"^tensorflow-io 0.22.0 requires tensorflow, which is not installed.$" ) allowed_exception_list.append(allowed_habana_tf_exception) framework, framework_version = get_framework_and_version_from_tag(image) # The v0.21 version of tensorflow-io has a bug fixed in v0.23 https://github.com/tensorflow/io/releases/tag/v0.23.0 tf263_io21_issue_framework_list = [ "tensorflow", "huggingface_tensorflow", "huggingface_tensorflow_trcomp", ] if framework in tf263_io21_issue_framework_list or Version(framework_version) in SpecifierSet( ">=2.6.3,<2.7" ): allowed_tf263_exception = re.compile( rf"^tensorflow-io 0.21.0 requires tensorflow, which is not installed.$" ) allowed_exception_list.append(allowed_tf263_exception) # TF2.9 sagemaker containers introduce tf-models-official which has a known bug where in it does not respect the # existing TF installation. https://github.com/tensorflow/models/issues/9267. This package in turn brings in # tensorflow-text. Skip checking these two packages as this is an upstream issue. if framework in ["tensorflow", "huggingface_tensorflow"] and Version( framework_version ) in SpecifierSet(">=2.9.1"): exception_strings = [] models_versions = ["2.9.1", "2.9.2", "2.10.0", "2.11.0", "2.12.0"] for ex_ver in models_versions: exception_strings += [f"tf-models-official {ex_ver}".replace(".", "\.")] text_versions = ["2.9.0", "2.10.0", "2.11.0", "2.12.0"] for ex_ver in text_versions: exception_strings += [f"tensorflow-text {ex_ver}".replace(".", "\.")] allowed_tf_models_text_exception = re.compile( rf"^({'|'.join(exception_strings)}) requires tensorflow, which is not installed." ) allowed_exception_list.append(allowed_tf_models_text_exception) allowed_tf_models_text_compatibility_exception = re.compile( rf"tf-models-official 2.9.2 has requirement tensorflow-text~=2.9.0, but you have tensorflow-text 2.10.0." ) allowed_exception_list.append(allowed_tf_models_text_compatibility_exception) if "pytorch" in image and "trcomp" in image: allowed_exception_list.append( re.compile(r"torch-xla \d+(\.\d+)* requires absl-py, which is not installed.") ) allowed_exception_list.append( re.compile(r"torch-xla \d+(\.\d+)* requires cloud-tpu-client, which is not installed.") ) # Add null entrypoint to ensure command exits immediately output = ctx.run(f"docker run --entrypoint='' {image} pip check", hide=True, warn=True) if output.return_code != 0: if not ( any( [ allowed_exception.findall(output.stdout) for allowed_exception in allowed_exception_list ] ) ): # Rerun pip check test if this is an unexpected failure ctx.run(f"docker run --entrypoint='' {image} pip check", hide=True) @pytest.mark.usefixtures("sagemaker", "huggingface") @pytest.mark.model("N/A") def test_cuda_paths(gpu): """ Test to ensure that: a. buildspec contains an entry to create the same image as the image URI b. directory structure for GPU Dockerfiles has framework version, python version, and cuda version in it :param gpu: gpu image uris """ image = gpu if "example" in image: pytest.skip("Skipping Example Dockerfiles which are not explicitly tied to a cuda version") dlc_path = os.getcwd().split("/test/")[0] job_type = "training" if "training" in image else "inference" # Ensure that image has a supported framework framework, framework_version = get_framework_and_version_from_tag(image) # Get cuda, framework version, python version through regex cuda_version = re.search(r"-(cu\d+)-", image).group(1) framework_short_version = re.match(r"(\d+\.\d+)", framework_version).group(1) python_version = re.search(r"(py\d+)", image).group(1) short_python_version = None image_tag = re.search( r":(\d+(\.\d+){2}(-(transformers|diffusers)\d+(\.\d+){2})?-(gpu)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)((-ec2)?-example|-ec2|-sagemaker-lite|-sagemaker-full|-sagemaker)?)", image, ).group(1) # replacing '_' by '/' to handle huggingface_ case framework = framework.replace("_trcomp", "") framework_path = framework.replace("_", "/") framework_version_path = os.path.join( dlc_path, framework_path, job_type, "docker", framework_version ) if not os.path.exists(framework_version_path): framework_version_path = os.path.join( dlc_path, framework_path, job_type, "docker", framework_short_version ) if not os.path.exists(os.path.join(framework_version_path, python_version)): # Use the pyX version as opposed to the pyXY version if pyXY path does not exist short_python_version = python_version[:3] # Check buildspec for cuda version buildspec = "buildspec" if is_tf_version("1", image): buildspec = "buildspec-tf1" if "trcomp" in image: buildspec = "buildspec-trcomp" if "sagemaker-lite" in image: buildspec = "buildspec-sagemaker-lite" image_tag_in_buildspec = False dockerfile_spec_abs_path = None buildspec_path = construct_buildspec_path( dlc_path, framework_path, buildspec, framework_version, job_type ) buildspec_def = Buildspec() buildspec_def.load(buildspec_path) for name, image_spec in buildspec_def["images"].items(): if image_spec["device_type"] == "gpu" and image_spec["tag"] == image_tag: image_tag_in_buildspec = True dockerfile_spec_abs_path = os.path.join( os.path.dirname(framework_version_path), image_spec["docker_file"].lstrip("docker/") ) break try: assert image_tag_in_buildspec, f"Image tag {image_tag} not found in {buildspec_path}" except AssertionError as e: if not is_dlc_cicd_context(): LOGGER.warn( f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context." ) else: raise image_properties_expected_in_dockerfile_path = [ framework_short_version or framework_version, short_python_version or python_version, cuda_version, ] assert all( prop in dockerfile_spec_abs_path for prop in image_properties_expected_in_dockerfile_path ), ( f"Dockerfile location {dockerfile_spec_abs_path} does not contain all the image properties in " f"{image_properties_expected_in_dockerfile_path}" ) assert os.path.exists( dockerfile_spec_abs_path ), f"Cannot find dockerfile for {image} in {dockerfile_spec_abs_path}" def _assert_artifact_free(output, stray_artifacts): """ Manage looping through assertions to determine that directories don't have known stray files. :param output: Invoke result object :param stray_artifacts: List of things that should not be present in these directories """ for artifact in stray_artifacts: assert not re.search( artifact, output.stdout ), f"Matched {artifact} in {output.stdout} while running {output.command}" def _test_sm_toolkit_and_ts_version(image, region): """ @param image: ECR image URI Make sure SM inference toolkit and torchserve versions match docker image label. """ cmd_smkit = "pip show sagemaker-pytorch-inference | grep -i Version" cmd_ts = "torchserve --version" ctx = Context() container_name = get_container_name("pytorch-smtoolkit-ts-check", image) start_container(container_name, image, ctx) # Get inference tool kit and torchserve version from bash command. output_smkit = run_cmd_on_container(container_name, ctx, cmd_smkit, executable="bash") tk_match = re.search(r"(\d+\.\d+\.\d+)", str(output_smkit.stdout)) if tk_match: toolkit_version_from_output = tk_match.group(0) else: raise RuntimeError( f"Can not determine inference tool kit version from container output : {str(output_smkit.stdout)}" ) output_ts = run_cmd_on_container(container_name, ctx, cmd_ts, executable="bash") ts_match = re.search(r"(\d+\.\d+\.\d+)", str(output_ts.stdout)) if ts_match: ts_version_from_output = ts_match.group(0) else: raise RuntimeError( f"Can not determine torchserve version from container output : {str(output_ts.stdout)}" ) # Verify image label image_labels = get_labels_from_ecr_image(image, region) expected_label = f"com.amazonaws.ml.engines.sagemaker.dlc.inference-toolkit.{toolkit_version_from_output}.torchserve.{ts_version_from_output}" has_expected_label = image_labels.get(expected_label) assert ( has_expected_label ), f"The label {expected_label} which enforces compatability between sagemaker inference toolkit and torchserve seems to be invalid/missing for the image {image}" @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("oss_compliance") @pytest.mark.model("N/A") @pytest.mark.skipif( not is_dlc_cicd_context(), reason="We need to test OSS compliance only on PRs and pipelines" ) def test_oss_compliance(image): """ Run oss compliance check on a container to check if license attribution files exist. And upload source of third party packages to S3 bucket. """ THIRD_PARTY_SOURCE_CODE_BUCKET = "aws-dlinfra-licenses" THIRD_PARTY_SOURCE_CODE_BUCKET_PATH = "third_party_source_code" file = "THIRD_PARTY_SOURCE_CODE_URLS" container_name = get_container_name("oss_compliance", image) context = Context() local_repo_path = get_repository_local_path() start_container(container_name, image, context) # run compliance test to make sure license attribution files exists. testOSSCompliance is copied as part of Dockerfile run_cmd_on_container(container_name, context, "/usr/local/bin/testOSSCompliance /root") try: context.run( f"docker cp {container_name}:/root/{file} {os.path.join(local_repo_path, file)}" ) finally: context.run(f"docker rm -f {container_name}", hide=True) s3_resource = boto3.resource("s3") with open(os.path.join(local_repo_path, file)) as source_code_file: for line in source_code_file: name, version, url = line.split(" ") file_name = f"{name}_v{version}_source_code" s3_object_path = f"{THIRD_PARTY_SOURCE_CODE_BUCKET_PATH}/{file_name}.tar.gz" local_file_path = os.path.join(local_repo_path, file_name) for i in range(3): try: if not os.path.isdir(local_file_path): context.run(f"git clone {url.rstrip()} {local_file_path}", hide=True) context.run(f"tar -czvf {local_file_path}.tar.gz {local_file_path}") except Exception as e: time.sleep(1) if i == 2: LOGGER.error(f"Unable to clone git repo. Error: {e}") raise continue try: if os.path.exists(f"{local_file_path}.tar.gz"): LOGGER.info(f"Uploading package to s3 bucket: {line}") s3_resource.Object(THIRD_PARTY_SOURCE_CODE_BUCKET, s3_object_path).load() except botocore.exceptions.ClientError as e: if e.response["Error"]["Code"] == "404": try: # using aws cli as using boto3 expects to upload folder by iterating through each file instead of entire folder. context.run( f"aws s3 cp {local_file_path}.tar.gz s3://{THIRD_PARTY_SOURCE_CODE_BUCKET}/{s3_object_path}" ) object = s3_resource.Bucket(THIRD_PARTY_SOURCE_CODE_BUCKET).Object( s3_object_path ) object.Acl().put(ACL="public-read") except ClientError as e: LOGGER.error( f"Unable to upload source code to bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}" ) raise else: LOGGER.error( f"Unable to check if source code is present on bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}" ) raise @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.model("N/A") def test_pytorch_training_sm_env_variables(pytorch_training): env_vars = {"SAGEMAKER_TRAINING_MODULE": "sagemaker_pytorch_container.training:main"} container_name_prefix = "pt_training_sm_env" execute_env_variables_test( image_uri=pytorch_training, env_vars_to_test=env_vars, container_name_prefix=container_name_prefix, ) @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.model("N/A") def test_pytorch_inference_sm_env_variables(pytorch_inference): env_vars = {"SAGEMAKER_SERVING_MODULE": "sagemaker_pytorch_serving_container.serving:main"} container_name_prefix = "pt_inference_sm_env" execute_env_variables_test( image_uri=pytorch_inference, env_vars_to_test=env_vars, container_name_prefix=container_name_prefix, ) @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.model("N/A") def test_tensorflow_training_sm_env_variables(tensorflow_training): env_vars = {"SAGEMAKER_TRAINING_MODULE": "sagemaker_tensorflow_container.training:main"} container_name_prefix = "tf_training_sm_env" execute_env_variables_test( image_uri=tensorflow_training, env_vars_to_test=env_vars, container_name_prefix=container_name_prefix, ) @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.model("N/A") def test_tensorflow_inference_sm_env_variables(tensorflow_inference): _, fw_version = get_framework_and_version_from_tag(tensorflow_inference) version_obj = Version(fw_version) tf_short_version = f"{version_obj.major}.{version_obj.minor}" env_vars = {"SAGEMAKER_TFS_VERSION": tf_short_version} container_name_prefix = "tf_inference_sm_env" execute_env_variables_test( image_uri=tensorflow_inference, env_vars_to_test=env_vars, container_name_prefix=container_name_prefix, ) @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.model("N/A") def test_mxnet_training_sm_env_variables(mxnet_training): env_vars = {"SAGEMAKER_TRAINING_MODULE": "sagemaker_mxnet_container.training:main"} container_name_prefix = "mx_training_sm_env" execute_env_variables_test( image_uri=mxnet_training, env_vars_to_test=env_vars, container_name_prefix=container_name_prefix, )