# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import os, subprocess, tarfile

import pytest
import sagemaker.huggingface
from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig

from test.test_utils import get_framework_and_version_from_tag, get_cuda_version_from_tag
from packaging.version import Version
from packaging.specifiers import SpecifierSet
from ...integration import DEFAULT_TIMEOUT
from ...integration.sagemaker.timeout import timeout
import sagemaker
import re

import unittest.mock as mock


# Base hyperparameters for the ``run_qa.py`` training jobs launched by the tests in this module.
hyperparameters = {
    "model_name_or_path": "bert-large-uncased-whole-word-masking",
    "dataset_name": "squad",
    "do_train": True,
    "do_eval": True,
    "fp16": True,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "num_train_epochs": 1,
    "max_seq_length": 384,
    "max_steps": 3,  # baseline step count; the multi-GPU tests scale it by the number of GPUs
    "max_eval_samples": 10,
    "pad_to_max_length": True,
    "doc_stride": 128,
    "output_dir": "/opt/ml/model",
}
# metric definition to extract the results
# The patterns are raw strings so that ``\D`` reaches the regex engine verbatim instead of being
# parsed as an (invalid) string escape sequence.
# NOTE(review): the lazy group that ends the "train_runtime" pattern can match the empty string,
# so that metric may never capture a value -- confirm against real job logs.
metric_definitions = [
    {"Name": "train_runtime", "Regex": r"'train_runtime':\D*([0-9,.]*?)"},
    {"Name": "device", "Regex": r"Using\D*([a-zA-Z0-9:]*)\D*device"},
    {"Name": "train_samples_per_second", "Regex": r"train_samples_per_second.*=\D*(.*?)$"},
    {"Name": "epoch", "Regex": r"epoch.*=\D*(.*?)$"},
    {"Name": "f1", "Regex": r"f1.*=\D*(.*?)$"},
    {"Name": "exact_match", "Regex": r"exact_match.*=\D*(.*?)$"},
]


def get_transformers_version(ecr_image):
    """
    Return the HF transformers version embedded in an image URI.

    The version is the two- or three-component number that directly follows the literal text
    ``transformers`` in ``ecr_image`` (e.g. ``transformers4.21.1`` gives ``"4.21.1"``).

    :param ecr_image: image URI (or tag) to search
    :return: the version as a string
    :raises LookupError: if no such version is present in ``ecr_image``
    """
    match = re.search(r"transformers(\d+(\.\d+){1,2})", ecr_image)
    if match is None:
        raise LookupError("HF transformers version not found in image URI")
    return match.group(1)


@pytest.fixture
def instance_type():
    """Default SageMaker instance type for tests that do not parametrize ``instance_type``."""
    default_instance_type = "ml.p3.2xlarge"
    return default_instance_type


@pytest.fixture
def instance_count():
    """Default number of instances for tests that do not parametrize ``instance_count``."""
    default_instance_count = 1
    return default_instance_count


@pytest.fixture
def num_gpus_per_instance(instance_type):
    """
    Return the GPU count this module associates with ``instance_type``.

    :raises NotImplementedError: for any instance type not listed in the table below
    """
    gpus_by_instance_type = {
        "ml.p3.16xlarge": 8,
        "ml.p4d.24xlarge": 8,
        "ml.g4dn.12xlarge": 4,
        "ml.g5.12xlarge": 4,
    }
    gpu_count = gpus_by_instance_type.get(instance_type)
    if gpu_count is None:
        raise NotImplementedError("Unforeseen Instance Type")
    return gpu_count


@pytest.fixture
def should_nccl_use_pcie(instance_type, instance_count, ecr_image):
    """
    Should NCCL be explicitly forced to use PCIE when NVLINK is not available ? This is baked in from PyTorch 1.12.

    Returns ``True`` only when ``instance_type`` belongs to a G instance family (``ml.g4dn...``,
    ``ml.g5...``) and the framework version parsed from ``ecr_image`` is older than 1.12.

    ``instance_count`` is accepted but not used in the decision.
    """
    pytorch_version = get_framework_and_version_from_tag(ecr_image)[1]
    # Match on the instance family prefix. A plain substring test for "g" is true for every
    # "*xlarge" size (the word "xlarge" contains a "g"), so it would also select e.g.
    # "ml.p3.16xlarge".
    is_g_family = re.match(r"(?:ml\.)?g\d", instance_type) is not None
    return is_g_family and Version(pytorch_version) in SpecifierSet("< 1.12")


# NOTE(review): "sagmaker-training-compiler" looks like a typo for "sagemaker-training-compiler".
# It is left unchanged because the marker value may be read by external test selection or
# reporting -- confirm before renaming.
@pytest.mark.integration("sagmaker-training-compiler")
@pytest.mark.processor("gpu")
@pytest.mark.skip_py2_containers
@pytest.mark.skip_huggingface_containers
@pytest.mark.skip_cpu
@mock.patch("sagemaker.huggingface.TrainingCompilerConfig.validate", return_value=None)
class TestSingleNodeSingleGPU:
    """
    All Single Node Single GPU tests go here.

    ``TrainingCompilerConfig.validate`` is patched to return ``None`` for every test method; the
    mock created by that patch is what each test receives as ``patched``.
    """

    @staticmethod
    def _run_training_job(
        compiler_config,
        job_name_base,
        ecr_image,
        sagemaker_session,
        py_version,
        instance_type,
        instance_count,
    ):
        """
        Run the transformers ``run_qa.py`` example as a training job and return the estimator.

        The example is fetched from the transformers repository at git ref ``v<version>``, where
        the version is the one found in ``ecr_image``. Estimator creation and ``fit`` both happen
        inside the module's ``timeout`` context.

        :param compiler_config: ``TrainingCompilerConfig`` instance under test
        :param job_name_base: base string from which a unique job name is derived
        :return: the ``HuggingFace`` estimator after ``fit`` has returned
        """
        transformers_version = get_transformers_version(ecr_image)
        git_config = {
            "repo": "https://github.com/huggingface/transformers.git",
            "branch": "v" + transformers_version,
        }

        # Pick the examples directory according to the transformers version (layout differs
        # before 4.6).
        source_dir = (
            "./examples/question-answering"
            if Version(transformers_version) < Version("4.6")
            else "./examples/pytorch/question-answering"
        )

        with timeout(minutes=DEFAULT_TIMEOUT):
            estimator = HuggingFace(
                compiler_config=compiler_config,
                entry_point="run_qa.py",
                source_dir=source_dir,
                git_config=git_config,
                metric_definitions=metric_definitions,
                role="SageMakerRole",
                image_uri=ecr_image,
                instance_count=instance_count,
                instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                hyperparameters=hyperparameters,
                py_version=py_version,
                max_retry_attempts=15,
            )
            estimator.fit(
                job_name=sagemaker.utils.unique_name_from_base(job_name_base),
                logs=True,
            )
        return estimator

    @staticmethod
    def _read_logs(capsys):
        """Return everything captured on stdout and stderr so far, concatenated."""
        captured = capsys.readouterr()
        return captured.out + captured.err

    @staticmethod
    def _assert_compiler_configured(logs):
        """Assert the log lines that every test in this class expects from a compiled job."""
        assert "Found configuration for Training Compiler" in logs
        assert "Configuring SM Training Compiler" in logs
        assert "device: xla" in logs

    @pytest.mark.model("bert-large")
    def test_trcomp_default(
        self,
        patched,
        ecr_image,
        sagemaker_session,
        tmpdir,
        py_version,
        capsys,
        instance_type,
        instance_count,
    ):
        """
        Tests the default configuration of SM trcomp
        """
        self._run_training_job(
            TrainingCompilerConfig(),
            "hf-pt-trcomp-SNSG-default",
            ecr_image,
            sagemaker_session,
            py_version,
            instance_type,
            instance_count,
        )
        self._assert_compiler_configured(self._read_logs(capsys))

    @pytest.mark.model("bert-large")
    def test_trcomp_enabled(
        self,
        patched,
        ecr_image,
        sagemaker_session,
        tmpdir,
        py_version,
        capsys,
        instance_type,
        instance_count,
    ):
        """
        Tests the explicit enabled configuration of SM trcomp
        """
        self._run_training_job(
            TrainingCompilerConfig(enabled=True),
            "hf-pt-trcomp-SNSG-enabled",
            ecr_image,
            sagemaker_session,
            py_version,
            instance_type,
            instance_count,
        )
        self._assert_compiler_configured(self._read_logs(capsys))

    @pytest.mark.model("bert-large")
    def test_trcomp_debug(
        self,
        patched,
        ecr_image,
        sagemaker_session,
        tmpdir,
        py_version,
        capsys,
        instance_type,
        instance_count,
    ):
        """
        Tests the debug mode configuration of SM trcomp

        Besides the log checks, the job's ``output.tar.gz`` is downloaded and must contain
        ``compiler/XLA_METRICS_FILE.txt``.
        """
        estimator = self._run_training_job(
            TrainingCompilerConfig(debug=True),
            "hf-pt-trcomp-SNSG-debug",
            ecr_image,
            sagemaker_session,
            py_version,
            instance_type,
            instance_count,
        )

        logs = self._read_logs(capsys)
        self._assert_compiler_configured(logs)
        assert "Training Compiler set to debug mode" in logs

        # The output archive is addressed by swapping the file name in the model artifact URI.
        debug_artifact_path = estimator.model_data.replace("model.tar.gz", "output.tar.gz")
        debug_artifact = os.path.join(tmpdir, "output.tar.gz")
        subprocess.check_output(["aws", "s3", "cp", debug_artifact_path, debug_artifact])
        with tarfile.open(debug_artifact, "r:gz") as tarball:
            tarball.extractall(path=tmpdir)
        xla_metrics_file = os.path.join(tmpdir, "compiler", "XLA_METRICS_FILE.txt")
        assert os.path.exists(xla_metrics_file)


# NOTE(review): "sagmaker-training-compiler" looks like a typo for "sagemaker-training-compiler".
# It is left unchanged because the marker value may be read by external test selection or
# reporting -- confirm before renaming.
@pytest.mark.integration("sagmaker-training-compiler")
@pytest.mark.processor("gpu")
@pytest.mark.skip_py2_containers
@pytest.mark.skip_huggingface_containers
@pytest.mark.skip_cpu
@mock.patch("sagemaker.huggingface.TrainingCompilerConfig.validate", return_value=None)
class TestSingleNodeMultiGPU:
    """
    All Single Node Multi GPU tests go here.

    ``TrainingCompilerConfig.validate`` is patched to return ``None`` for every test method; the
    mock created by that patch is what each test receives as ``patched``.
    """

    @pytest.mark.parametrize(
        "instance_type, instance_count",
        [
            ("ml.p3.16xlarge", 1),
            ("ml.g4dn.12xlarge", 1),
            ("ml.g5.12xlarge", 1),
        ],
    )
    @pytest.mark.model("bert-large")
    def test_trcomp_default(
        self,
        patched,
        ecr_image,
        sagemaker_session,
        tmpdir,
        py_version,
        capsys,
        instance_type,
        instance_count,
        num_gpus_per_instance,
        should_nccl_use_pcie,
    ):
        """
        Tests the default configuration of SM trcomp

        Runs ``run_qa.py`` with the ``pytorchxla`` distribution enabled on a single multi-GPU
        instance and checks the logs for the compiler, the PT-XLA runner and the expected number
        of ranks.
        """
        transformers_version = get_transformers_version(ecr_image)
        git_config = {
            "repo": "https://github.com/huggingface/transformers.git",
            "branch": "v" + transformers_version,
        }

        source_dir = (
            "./examples/question-answering"
            if Version(transformers_version) < Version("4.6")
            else "./examples/pytorch/question-answering"
        )

        # Build a per-test copy rather than assigning into the module-level dict, so the scaled
        # max_steps cannot leak into other tests that share ``hyperparameters``.
        job_hyperparameters = {**hyperparameters, "max_steps": 3 * num_gpus_per_instance}

        # Temporary measure to enable communication through PCIe instead of NVLink
        environment = {"NCCL_P2P_LEVEL": "PXB"} if should_nccl_use_pcie else {}

        with timeout(minutes=DEFAULT_TIMEOUT):
            estimator = HuggingFace(
                compiler_config=TrainingCompilerConfig(),
                entry_point="run_qa.py",
                source_dir=source_dir,
                git_config=git_config,
                metric_definitions=metric_definitions,
                role="SageMakerRole",
                image_uri=ecr_image,
                instance_count=instance_count,
                instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                hyperparameters=job_hyperparameters,
                py_version=py_version,
                max_retry_attempts=15,
                distribution={"pytorchxla": {"enabled": True}},
                environment=environment,
            )
            estimator.fit(
                job_name=sagemaker.utils.unique_name_from_base("hf-pt-trcomp-SNMG-default"),
                logs=True,
            )
        captured = capsys.readouterr()
        logs = captured.out + captured.err
        assert "Found configuration for Training Compiler" in logs
        assert "Configuring SM Training Compiler" in logs
        assert "device: xla" in logs
        assert "Invoking PT-XLA Runner" in logs
        assert "distributed training through PT-XLA Runtime" in logs
        assert "torch_xla.distributed.xla_spawn" in logs
        assert f"nranks {num_gpus_per_instance}" in logs


# NOTE(review): "sagmaker-training-compiler" looks like a typo for "sagemaker-training-compiler".
# It is left unchanged because the marker value may be read by external test selection or
# reporting -- confirm before renaming.
@pytest.mark.integration("sagmaker-training-compiler")
@pytest.mark.processor("gpu")
@pytest.mark.skip_py2_containers
@pytest.mark.skip_huggingface_containers
@pytest.mark.skip_cpu
@mock.patch("sagemaker.huggingface.TrainingCompilerConfig.validate", return_value=None)
class TestMultiNodeMultiGPU:
    """
    All Multi Node Multi GPU tests go here.

    ``TrainingCompilerConfig.validate`` is patched to return ``None`` for every test method; the
    mock created by that patch is what each test receives as ``patched``.
    """

    @pytest.mark.parametrize(
        "instance_type, instance_count",
        [
            ("ml.p3.16xlarge", 2),
            ("ml.p4d.24xlarge", 2),
            ("ml.g4dn.12xlarge", 2),
            ("ml.g5.12xlarge", 2),
        ],
    )
    @pytest.mark.model("bert-large")
    def test_trcomp_default(
        self,
        patched,
        ecr_image,
        sagemaker_session,
        tmpdir,
        py_version,
        capsys,
        instance_type,
        instance_count,
        num_gpus_per_instance,
        should_nccl_use_pcie,
    ):
        """
        Tests the default configuration of SM trcomp

        Runs ``run_qa.py`` with the ``pytorchxla`` distribution enabled across several multi-GPU
        instances and checks the logs for the compiler, the PT-XLA runner and the expected total
        number of ranks.
        """
        transformers_version = get_transformers_version(ecr_image)
        git_config = {
            "repo": "https://github.com/huggingface/transformers.git",
            "branch": "v" + transformers_version,
        }

        source_dir = (
            "./examples/question-answering"
            if Version(transformers_version) < Version("4.6")
            else "./examples/pytorch/question-answering"
        )

        total_gpus = num_gpus_per_instance * instance_count
        # Build a per-test copy rather than assigning into the module-level dict, so the scaled
        # max_steps cannot leak into other tests that share ``hyperparameters``.
        job_hyperparameters = {**hyperparameters, "max_steps": 3 * total_gpus}

        # Temporary measure to enable communication through PCIe instead of NVLink
        environment = {"NCCL_P2P_LEVEL": "PXB"} if should_nccl_use_pcie else {}

        with timeout(minutes=DEFAULT_TIMEOUT):
            estimator = HuggingFace(
                compiler_config=TrainingCompilerConfig(),
                entry_point="run_qa.py",
                source_dir=source_dir,
                git_config=git_config,
                metric_definitions=metric_definitions,
                role="SageMakerRole",
                image_uri=ecr_image,
                instance_count=instance_count,
                instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                hyperparameters=job_hyperparameters,
                py_version=py_version,
                max_retry_attempts=15,
                distribution={"pytorchxla": {"enabled": True}},
                environment=environment,
            )
            estimator.fit(
                job_name=sagemaker.utils.unique_name_from_base("hf-pt-trcomp-MNMG-default"),
                logs=True,
            )
        captured = capsys.readouterr()
        logs = captured.out + captured.err
        assert "Found configuration for Training Compiler" in logs
        assert "Configuring SM Training Compiler" in logs
        assert "device: xla" in logs
        assert "Invoking PT-XLA Runner" in logs
        assert "distributed training through PT-XLA Runtime" in logs
        assert "torch_xla.distributed.xla_spawn" in logs
        assert f"nranks {total_gpus}" in logs