#!/bin/bash

source common.sh
source inject_tests.sh

# Applies each of the resources needed for the canary tests.
# Parameter:
#    $1: Namespace of the CRD
function run_canary_tests {
    local crd_namespace="$1"
    echo "Injecting variables into tests"
    inject_all_variables

    echo "Starting Canary Tests"
    run_test "${crd_namespace}" testfiles/xgboost-mnist-trainingjob.yaml
    run_test "${crd_namespace}" testfiles/kmeans-mnist-processingjob.yaml
    run_test "${crd_namespace}" testfiles/xgboost-mnist-hpo.yaml
    # Special code for batch transform until we fix issue-59
    run_test "${crd_namespace}" testfiles/xgboost-model.yaml
    # We need to get the SageMaker model before running batch transform
    verify_test "${crd_namespace}" Model xgboost-model 5m Created
    yq w -i testfiles/xgboost-mnist-batchtransform.yaml "spec.modelName" "$(get_sagemaker_model_from_k8s_model "${crd_namespace}" xgboost-model)"
    run_test "${crd_namespace}" testfiles/xgboost-mnist-batchtransform.yaml
    run_test "${crd_namespace}" testfiles/xgboost-hosting-deployment.yaml
}

# Applies each of the resources needed for the canary tests in the China regions.
# Parameter:
#    $1: Namespace of the CRD
function run_canary_tests_china {
    local crd_namespace="$1"
    echo "Injecting variables into tests"
    inject_all_variables

    echo "Starting Canary Tests China"
    run_test "${crd_namespace}" testfiles/xgboost-mnist-trainingjob-china.yaml
    run_test "${crd_namespace}" testfiles/xgboost-mnist-hpo-china.yaml
    # Special code for batch transform until we fix issue-59
    run_test "${crd_namespace}" testfiles/xgboost-model-china.yaml
    # We need to get the SageMaker model before running batch transform
    verify_test "${crd_namespace}" Model xgboost-model-china 5m Created
    yq w -i testfiles/xgboost-mnist-batchtransform-china.yaml "spec.modelName" "$(get_sagemaker_model_from_k8s_model "${crd_namespace}" xgboost-model-china)"
    run_test "${crd_namespace}" testfiles/xgboost-mnist-batchtransform-china.yaml
}

# Applies each of the resources needed for the integration tests.
# Parameter:
#    $1: Namespace of the CRD
function run_integration_tests {
    local crd_namespace="$1"
    run_canary_tests "${crd_namespace}"

    # TODO: Automate creation/testing of EFS file systems for relevant jobs
    # Build prerequisite resources
    if [ "$FSX_ID" == "" ]; then
        echo "Skipping build_fsx_from_s3 as fsx tests are disabled"
        # build_fsx_from_s3
    fi

    echo "Starting integration tests"
    run_test "${crd_namespace}" testfiles/spot-xgboost-mnist-trainingjob.yaml
    run_test "${crd_namespace}" testfiles/xgboost-mnist-custom-endpoint.yaml
    # run_test "${crd_namespace}" testfiles/efs-xgboost-mnist-trainingjob.yaml
    # run_test "${crd_namespace}" testfiles/fsx-kmeans-mnist-trainingjob.yaml
    run_test "${crd_namespace}" testfiles/spot-xgboost-mnist-hpo.yaml
    run_test "${crd_namespace}" testfiles/xgboost-mnist-hpo-custom-endpoint.yaml
    run_test "${crd_namespace}" testfiles/xgboost-mnist-trainingjob-debugger.yaml
    run_test "${crd_namespace}" testfiles/xgboost-hosting-deployment-multi-container.yaml
    run_hap_test "${crd_namespace}" named-xgboost-hosting testfiles/xgboost-hostingautoscaling.yaml testfiles/xgboost-hostingautoscaling-custom.yaml
    run_test "${crd_namespace}" testfiles/hd-retain-varient-properties.yaml
}
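
# Example (assumed) invocation from a test driver. The operator is expected to already be
# installed in the target namespace, and common.sh/inject_tests.sh are expected to provide
# run_test, verify_test, wait_for_crd_status, inject_all_variables, and the other helpers
# used in this file. The CRD_NAMESPACE variable below is illustrative, not defined here:
#
#   run_integration_tests "${CRD_NAMESPACE:-default}"
#   verify_integration_tests "${CRD_NAMESPACE:-default}"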
# Verifies that all resources were created and are running/completed for the canary tests.
# Parameter:
#    $1: Namespace of the CRD
function verify_canary_tests {
    local crd_namespace="$1"
    echo "Verifying canary tests"
    verify_test "${crd_namespace}" TrainingJob xgboost-mnist 20m Completed
    verify_test "${crd_namespace}" ProcessingJob kmeans-mnist 20m Completed
    verify_test "${crd_namespace}" HyperparameterTuningJob xgboost-mnist-hpo 20m Completed
    verify_test "${crd_namespace}" BatchTransformJob xgboost-batch 20m Completed
    verify_test "${crd_namespace}" HostingDeployment xgboost-hosting 90m InService
}

# Verifies that all resources were created and are running/completed for the canary tests in the China regions.
# Parameter:
#    $1: Namespace of the CRD
function verify_canary_tests_china {
    local crd_namespace="$1"
    echo "Verifying canary tests China"
    verify_test "${crd_namespace}" TrainingJob xgboost-mnist-china 20m Completed
    verify_test "${crd_namespace}" HyperparameterTuningJob xgboost-mnist-hpo-china 20m Completed
    verify_test "${crd_namespace}" BatchTransformJob xgboost-batch-china 20m Completed
}

# Verifies that all resources were created and are running/completed for the integration tests.
# Parameter:
#    $1: Namespace of the CRD
function verify_integration_tests {
    local crd_namespace="$1"
    echo "Verifying integration tests"
    verify_canary_tests "${crd_namespace}"
    verify_test "${crd_namespace}" TrainingJob spot-xgboost-mnist 20m Completed
    verify_test "${crd_namespace}" TrainingJob xgboost-mnist-custom-endpoint 20m Completed
    # verify_test "${crd_namespace}" TrainingJob efs-xgboost-mnist 20m Completed
    # verify_test "${crd_namespace}" TrainingJob fsx-kmeans-mnist 20m Completed
    verify_test "${crd_namespace}" HyperparameterTuningJob spot-xgboost-mnist-hpo 20m Completed
    verify_test "${crd_namespace}" HyperparameterTuningJob xgboost-mnist-hpo-custom-endpoint 20m Completed
    verify_test "${crd_namespace}" HostingDeployment xgboost-hosting-multi-container 90m InService
    verify_test "${crd_namespace}" TrainingJob xgboost-mnist-debugger 20m Completed
    # Verify that the debug job has the expected status
    verify_debug_test "${crd_namespace}" TrainingJob xgboost-mnist-debugger 20m NoIssuesFound
    verify_test "${crd_namespace}" HostingAutoscalingPolicy hap-predefined 5m Created
    verify_test "${crd_namespace}" HostingAutoscalingPolicy hap-custom-metric 5m Created
    # verify_hap_test "3"
    verify_retain_varient_properties "${crd_namespace}" testfiles/hd-retain-varient-properties.yaml
}

# Verifies the retainAllVariantProperties/excludeRetainedVariantProperties behavior of a HostingDeployment.
# Parameters:
#    $1: Namespace of the CRD
#    $2: Path to the HostingDeployment test file
function verify_retain_varient_properties() {
    local crd_namespace="$1"
    local hostingdeployment_yaml_filepath="$2"
    local autoscaling_yaml_filepath="testfiles/hd-autoscaling-retain-varient-properties.yaml"

    echo "Verifying retain variant properties"
    # The endpoint has already been created with instance count 1 and variant weight 2
    endpoint_name=$(yq r "$hostingdeployment_yaml_filepath" "spec.endpointName")
    endpoint_region=$(yq r "$hostingdeployment_yaml_filepath" "spec.region")
    hostingdeployment_name=$(yq r "$hostingdeployment_yaml_filepath" "metadata.name")
    wait_for_crd_status "$crd_namespace" HostingDeployment "$hostingdeployment_name" 90m InService

    # Verify that the already created endpoint has instance count 1 and weight 2
    instance_count=$(aws sagemaker describe-endpoint --endpoint-name "$endpoint_name" --region "$endpoint_region" --query ProductionVariants[0].CurrentInstanceCount)
    weight=$(aws sagemaker describe-endpoint --endpoint-name "$endpoint_name" --region "$endpoint_region" --query ProductionVariants[0].CurrentWeight | awk '{printf "%.0f\n", $1}')
    if [ "${instance_count}" == "1" ] && [ "${weight}" == "2" ]; then
        echo "Initial verification that instance_count is 1 and weight is 2"
    else
        echo "[FAILED] Initial hosting deployment has wrong properties. Maybe the default namespace test didn't reset the values"
        echo "instance_count: $instance_count, weight: $weight"
        exit 1
    fi

    # Autoscaling increases the instance count to 2 (while keeping the previous weight of 2)
    yq w -i "$autoscaling_yaml_filepath" "spec.resourceId[0].endpointName" "$endpoint_name"
    kubectl apply -n "$crd_namespace" -f "$autoscaling_yaml_filepath"
    wait_for_crd_status "$crd_namespace" HostingDeployment "$hostingdeployment_name" 90m Updating
    wait_for_crd_status "$crd_namespace" HostingDeployment "$hostingdeployment_name" 90m InService

    # Verify that the instance count is now 2
    instance_count=$(aws sagemaker describe-endpoint --endpoint-name "$endpoint_name" --region "$endpoint_region" --query ProductionVariants[0].CurrentInstanceCount)
    if [ "${instance_count}" == "2" ]; then
        echo "Autoscaling applied to increase instance count to 2"
    else
        echo "[FAILED] Failed to apply autoscaling policy in retain variant properties test"
        echo "instance_count: $instance_count"
        exit 1
    fi

    # Re-apply the hosting deployment with instance count 1 and variant weight 3,
    # retainAllVariantProperties "true" and excludeRetainedVariantProperties "DesiredWeight".
    #
    # This should retain the previous instance count of 2 and
    # should not retain the previous variant weight of 2, changing it to 3.
    yq w -i "$hostingdeployment_yaml_filepath" "spec.productionVariants[0].initialVariantWeight" 3
    kubectl apply -n "$crd_namespace" -f "$hostingdeployment_yaml_filepath"
    wait_for_crd_status "$crd_namespace" HostingDeployment "$hostingdeployment_name" 90m Updating
    wait_for_crd_status "$crd_namespace" HostingDeployment "$hostingdeployment_name" 90m InService

    # Check that it retained the previous instance count and did not retain the previous weight
    # sleep 5 && wait_for_crd_status "${crd_namespace}" HostingDeployment $hostingdeployment_name 40m InService && sleep 5
    instance_count=$(aws sagemaker describe-endpoint --endpoint-name "$endpoint_name" --region "$endpoint_region" --query ProductionVariants[0].CurrentInstanceCount)
    weight=$(aws sagemaker describe-endpoint --endpoint-name "$endpoint_name" --region "$endpoint_region" --query ProductionVariants[0].CurrentWeight | awk '{printf "%.0f\n", $1}')
    if [ "${instance_count}" == "2" ] && [ "${weight}" == "3" ]; then
        echo "[PASSED] Successfully tested retainAllVariantProperties and excludeRetainedVariantProperties"
    else
        echo "[FAILED] Failed to retain the variant properties"
        echo "instance_count: $instance_count, weight: $weight"
        exit 1
    fi

    # Change the weight back to 2 for the namespace-based tests
    yq w -i "$hostingdeployment_yaml_filepath" "spec.productionVariants[0].initialVariantWeight" 2
}
# Replaces the names of the endpoints generated in the previous test into the HAP spec files and runs the tests.
# Parameters:
#    $1: Target namespace
#    $2: K8s name and endpointName of the HostingDeployment to apply autoscaling to
#    $3: Filename of the HAP test to run
#    $4: Filename of the custom metric HAP test to run
function run_hap_test() {
    local target_namespace="$1"
    local hosting_deployment_1="$2"
    local file_name="$3"
    local file_name_custom="$4"
    local hosting_deployment_2="${hosting_deployment_1}-2"
    local hosting_deployment_3="${hosting_deployment_1}-3"
    local hostingdeployment_type="hostingdeployment"

    # Create the 3 endpoints
    update_hostingdeployment_input_metadata "${target_namespace}" "${hosting_deployment_1}" "true"
    update_hostingdeployment_input_metadata "${target_namespace}" "${hosting_deployment_2}" "true"
    update_hostingdeployment_input_metadata "${target_namespace}" "${hosting_deployment_3}" "true"

    # This can be removed, but will make debugging easier with the same runtime.
    verify_test "${target_namespace}" HostingDeployment "${hosting_deployment_1}" 90m InService
    verify_test "${target_namespace}" HostingDeployment "${hosting_deployment_2}" 90m InService
    verify_test "${target_namespace}" HostingDeployment "${hosting_deployment_3}" 90m InService

    # HAP Test 1: Using the predefined metric
    yq w -i "$file_name" "spec.resourceId[0].endpointName" "${hosting_deployment_1}"
    yq w -i "$file_name" "spec.resourceId[1].endpointName" "${hosting_deployment_2}"
    run_test "$target_namespace" "$file_name"

    # HAP Test 2: Using the custom metric
    yq w -i "$file_name_custom" "spec.resourceId[0].endpointName" "${hosting_deployment_3}"
    yq w -i "$file_name_custom" "spec.targetTrackingScalingPolicyConfiguration.customizedMetricSpecification.dimensions[0].value" "${hosting_deployment_3}"
    run_test "$target_namespace" "$file_name_custom"

    update_hostingdeployment_input_metadata "${target_namespace}" "${hosting_deployment_1}" "false"
}

# This function verifies that the number of scaling policies applied is as expected
# Parameter:
#    $1: The expected number of policies
function verify_hap_test() {
    local expected_number_of_policies="$1"

    scaling_policies="$(aws application-autoscaling describe-scaling-policies --region us-west-2 --service-namespace sagemaker | jq .ScalingPolicies)"
    number_of_policies_applied="$(echo "$scaling_policies" | jq length)"

    if [ "${number_of_policies_applied}" == "${expected_number_of_policies}" ]; then
        echo "[PASSED] All scaling policies were successfully applied to the specified number of endpoints"
    else
        echo "[FAILED] ${number_of_policies_applied} scaling policy/policies were applied. Expected ${expected_number_of_policies}."
        exit 1
    fi
}

# This function verifies that a given debug job has the expected status
# Parameters:
#    $1: Namespace of the CRD
#    $2: Kind of CRD
#    $3: Instance of CRD
#    $4: Timeout to complete the test
#    $5: Expected status of the single debug rule evaluation
function verify_debug_test {
    local crd_namespace="$1"
    local crd_type="$2"
    local crd_instance="$3"
    local timeout="$4"
    local expected_debug_job_status="$5"

    # First verify that the training job has completed
    verify_test "${crd_namespace}" TrainingJob xgboost-mnist-debugger "$timeout" Completed

    # TODO: extend this for multiple debug jobs with a debug job statuses parameter
    echo "Waiting for debug job to finish"
    timeout "${timeout}" bash -c \
        'until [ "$(kubectl get -n "$0" "$1" "$2" -o json | jq -r .status.debugRuleEvaluationStatuses[0].ruleEvaluationStatus)" == "$3" ]; do \
            echo "Debug job has not completed yet"; \
            sleep 1; \
        done' "${crd_namespace}" "${crd_type}" "${crd_instance}" "${expected_debug_job_status}"

    if [ $? -ne 0 ]; then
        echo "[FAILED] Debug job ${crd_type} ${crd_instance} with expected status ${expected_debug_job_status} has timed out"
        exit 1
    fi
    echo "Debug job has completed"
}

# This function verifies that a job has started and has not failed
# Parameters:
#    $1: Namespace of the CRD
#    $2: Kind of CRD
#    $3: Instance of CRD
#    $4: Timeout to complete the test
#    $5: The status that verifies the job has succeeded.
# e.g. verify_test default trainingjobs xgboost-mnist
function verify_test() {
    local crd_namespace="$1"
    local crd_type="$2"
    local crd_instance="$3"
    local timeout="$4"
    local desired_status="$5"

    # Check that the job exists
    if ! kubectl get -n "${crd_namespace}" "${crd_type}" "${crd_instance}"; then
        echo "[FAILED] ${crd_type} ${crd_namespace}:${crd_instance} job does not exist"
        exit 1
    fi

    # Wait until the job has started. There will be two rows: the header (STATUS) and the entry.
    # The entry will be none if the job has not started yet. When the job starts, none will disappear
    # and the real status will be present.
    echo "Waiting for ${crd_type} ${crd_namespace}:${crd_instance} job to start"
    timeout 1m bash -c \
        'until [ "$(kubectl get -n "$0" "$1" "$2" -o=custom-columns=STATUS:.status | grep -i none | wc -l)" -eq "0" ]; do \
            echo "$1 $0:$2 job has not started yet"; \
            sleep 1; \
        done' "${crd_namespace}" "${crd_type}" "${crd_instance}"

    # The job has started; check whether it has failed
    if kubectl get -n "${crd_namespace}" "${crd_type}" "${crd_instance}" -o=custom-columns=NAME:.status | grep -i fail; then
        echo "[FAILED] ${crd_type} ${crd_namespace}:${crd_instance} job has failed"
        exit 1
    fi

    echo "Waiting for ${crd_type} ${crd_namespace}:${crd_instance} job to complete"
    if ! wait_for_crd_status "$crd_namespace" "$crd_type" "$crd_instance" "$timeout" "$desired_status"; then
        echo "[FAILED] Waiting for status ${desired_status} failed"
        exit 1
    fi

    # The job has reached the desired status
    echo "[PASSED] Verified ${crd_type} ${crd_namespace}:${crd_instance} has completed"
}
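
# verify_test and verify_retain_varient_properties rely on wait_for_crd_status from common.sh
# (sourced at the top of this file). Its real implementation is not shown here; the sketch
# below (hypothetical name wait_for_crd_status_sketch, not called anywhere) captures the
# behavior these tests assume: poll the CRD status until it matches the desired value, and
# return non-zero if the timeout elapses first.
function wait_for_crd_status_sketch() {
    local crd_namespace="$1" crd_type="$2" crd_instance="$3" timeout="$4" desired_status="$5"
    timeout "${timeout}" bash -c \
        'until kubectl get -n "$0" "$1" "$2" -o=custom-columns=STATUS:.status | grep -qi "$3"; do sleep 5; done' \
        "${crd_namespace}" "${crd_type}" "${crd_instance}" "${desired_status}"
}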
# Build a new FSx file system from an S3 data source to be used by the FSx integration tests.
function build_fsx_from_s3() {
    echo "Building FSx file system from S3 data source"

    NEW_FS="$(aws fsx create-file-system \
        --file-system-type LUSTRE \
        --lustre-configuration ImportPath=s3://${DATA_BUCKET}/kmeans_mnist_example \
        --storage-capacity 1200 \
        --subnet-ids subnet-187e9960 \
        --tags Key="Name",Value="$(date '+%Y-%m-%d-%H-%M-%S')" \
        --region us-west-2)"
    echo "$NEW_FS"

    FSX_ID="$(echo "$NEW_FS" | jq -r ".FileSystem.FileSystemId")"

    FS_AVAILABLE=CREATING
    until [[ "${FS_AVAILABLE}" != "CREATING" ]]; do
        FS_AVAILABLE="$(aws fsx --region us-west-2 describe-file-systems --file-system-ids "${FSX_ID}" | jq -r ".FileSystems[0].Lifecycle")"
        sleep 30
    done
    aws fsx --region us-west-2 describe-file-systems --file-system-ids "${FSX_ID}"

    if [[ "${FS_AVAILABLE}" != "AVAILABLE" ]]; then
        exit 1
    fi

    export FSX_ID=$FSX_ID
}

# Replaces the name of the endpoint in the HostingDeployment spec file and optionally creates it
# Parameters:
#    $1: Target namespace
#    $2: K8s name and endpointName of the HostingDeployment to apply autoscaling to
#    $3: Boolean indicating whether the modified HostingDeployment should be applied
function update_hostingdeployment_input_metadata() {
    local crd_namespace="$1"
    local hosting_deployment="$2"
    local is_create="$3"
    local file_name_hd="testfiles/xgboost-hosting-deployment-with-name.yaml"

    yq w -i "${file_name_hd}" "metadata.name" "$hosting_deployment"
    yq w -i "${file_name_hd}" "spec.endpointName" "$hosting_deployment"

    if [[ "${is_create}" == "true" ]]; then
        run_test "${crd_namespace}" "${file_name_hd}"
    fi
}
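
# build_fsx_from_s3 leaves the Lustre file system running after the tests. A cleanup helper
# along these lines (hypothetical; not called anywhere in this script) could be used to
# delete it once the FSx tests are re-enabled.
function delete_fsx_file_system() {
    local fsx_id="$1"
    # DeleteFileSystem is irreversible; any unexported data on the file system is lost.
    aws fsx delete-file-system --file-system-id "${fsx_id}" --region us-west-2
}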