import os import pytest import test.test_utils as test_utils from packaging.version import Version from packaging.specifiers import SpecifierSet from test.test_utils import ( CONTAINER_TESTS_PREFIX, LOGGER, is_tf_version, get_framework_and_version_from_tag, is_nightly_context, ) from test.test_utils.ec2 import get_ec2_instance_type SMDEBUG_SCRIPT = os.path.join(CONTAINER_TESTS_PREFIX, "testSmdebug") SMPROFILER_SCRIPT = os.path.join(CONTAINER_TESTS_PREFIX, "testSmprofiler") SMDEBUG_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="p3.8xlarge", processor="gpu") SMDEBUG_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c4.8xlarge", processor="cpu") @pytest.mark.usefixtures("feature_smdebug_present") @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.integration("smdebug") @pytest.mark.model("mnist") @pytest.mark.parametrize("ec2_instance_type", SMDEBUG_EC2_GPU_INSTANCE_TYPE, indirect=True) @pytest.mark.flaky(reruns=0) def test_smdebug_gpu(training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}") _, image_framework_version = get_framework_and_version_from_tag(training) if ( "trcomp" in training and "pytorch" in training and Version(image_framework_version) in SpecifierSet("<2.0") ): pytest.skip(f"Image {training} doesn't support s3. Hence test is skipped.") smdebug_test_timeout = 2400 if is_tf_version("1", training): if is_nightly_context(): smdebug_test_timeout = 7200 else: pytest.skip( "TF1 gpu smdebug tests can take up to 2 hours, thus we are only running in nightly context" ) run_smdebug_test( training, ec2_connection, region, ec2_instance_type, docker_executable="nvidia-docker", container_name="smdebug-gpu", timeout=smdebug_test_timeout, ) @pytest.mark.usefixtures("feature_smdebug_present") @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.integration("smprofiler") @pytest.mark.model("mnist") @pytest.mark.parametrize("ec2_instance_type", SMDEBUG_EC2_GPU_INSTANCE_TYPE, indirect=True) @pytest.mark.flaky(reruns=0) def test_smprofiler_gpu( training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only, tf23_and_above_only, pt16_and_above_only, ): # Running the profiler tests for pytorch and tensorflow2 frameworks only. # This code needs to be modified past reInvent 2020 if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}") _, image_framework_version = get_framework_and_version_from_tag(training) if ( "trcomp" in training and "pytorch" in training and Version(image_framework_version) in SpecifierSet("<2.0") ): pytest.skip(f"Image {training} doesn't support s3. Hence test is skipped.") framework = get_framework_from_image_uri(training) if framework not in ["pytorch", "tensorflow2"]: return smdebug_test_timeout = 2400 run_smprofiler_test( training, ec2_connection, region, ec2_instance_type, docker_executable="nvidia-docker", container_name="smdebug-gpu", timeout=smdebug_test_timeout, ) @pytest.mark.usefixtures("feature_smdebug_present") @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.flaky(reruns=0) @pytest.mark.integration("smdebug") @pytest.mark.model("mnist") @pytest.mark.parametrize("ec2_instance_type", SMDEBUG_EC2_CPU_INSTANCE_TYPE, indirect=True) def test_smdebug_cpu(training, ec2_connection, region, ec2_instance_type, cpu_only, py3_only): run_smdebug_test(training, ec2_connection, region, ec2_instance_type) @pytest.mark.usefixtures("feature_smdebug_present") @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.flaky(reruns=0) @pytest.mark.integration("smdebug") @pytest.mark.model("mnist") @pytest.mark.parametrize("ec2_instance_type", SMDEBUG_EC2_CPU_INSTANCE_TYPE, indirect=True) def test_smprofiler_cpu( training, ec2_connection, region, ec2_instance_type, cpu_only, py3_only, tf23_and_above_only, pt16_and_above_only, ): # Running the profiler tests for pytorch and tensorflow2 frameworks only. # This code needs to be modified past reInvent 2020 framework = get_framework_from_image_uri(training) if framework not in ["pytorch", "tensorflow2"]: return run_smprofiler_test(training, ec2_connection, region, ec2_instance_type) class SMDebugTestFailure(Exception): pass def run_smdebug_test( image_uri, ec2_connection, region, ec2_instance_type, docker_executable="docker", container_name="smdebug", test_script=SMDEBUG_SCRIPT, timeout=2400, ): large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge") shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " " framework = get_framework_from_image_uri(image_uri) container_test_local_dir = os.path.join("$HOME", "container_tests") ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp ec2_connection.run(f"docker pull {image_uri}") try: ec2_connection.run( f"{docker_executable} run --name {container_name} -v " f"{container_test_local_dir}:{os.path.join(os.sep, 'test')}{shm_setting}{image_uri} " f"./{test_script} {framework}", hide=True, timeout=timeout, ) except Exception as e: debug_output = ec2_connection.run(f"docker logs {container_name}") debug_stdout = debug_output.stdout if "All SMDebug tests succeeded!" in debug_stdout: LOGGER.warning( f"SMDebug tests succeeded, but there is an issue with fabric:\n{e}:\nTest output:\n{debug_stdout}" ) return raise SMDebugTestFailure( f"SMDebug test failed on {image_uri} on {ec2_instance_type}. Full output:\n{debug_stdout}" ) from e def run_smprofiler_test( image_uri, ec2_connection, region, ec2_instance_type, docker_executable="docker", container_name="smdebug", test_script=SMPROFILER_SCRIPT, timeout=2400, ): large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge") shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " " framework = get_framework_from_image_uri(image_uri) container_test_local_dir = os.path.join("$HOME", "container_tests") ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp ec2_connection.run(f"docker pull {image_uri}") try: ec2_connection.run( f"{docker_executable} run --name {container_name} -v " f"{container_test_local_dir}:{os.path.join(os.sep, 'test')}{shm_setting}{image_uri} " f"./{test_script} {framework}", hide=True, timeout=timeout, ) except Exception as e: debug_output = ec2_connection.run(f"docker logs {container_name}") debug_stdout = debug_output.stdout if "All SMprofiler tests succeeded!" in debug_stdout: LOGGER.warning( f"SMProfiler tests succeeded, but there is an issue with fabric:\n{e}:\nTest output:\n{debug_stdout}" ) return raise SMDebugTestFailure( f"SMProfiler test failed on {image_uri} on {ec2_instance_type}. Full output:\n{debug_stdout}" ) from e def get_framework_from_image_uri(image_uri): frameworks = ("tensorflow", "mxnet", "pytorch") for framework in frameworks: if framework in image_uri: if framework == "tensorflow" and is_tf_version("2", image_uri): return "tensorflow2" return framework raise RuntimeError(f"Could not find any framework {frameworks} in {image_uri}")