// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package windows_test

import (
	"fmt"
	"math/rand"
	"strconv"
	"time"

	"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config"
	"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/node/manager"
	"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/utils"
	"github.com/aws/amazon-vpc-resource-controller-k8s/test/framework/manifest"
	"github.com/aws/amazon-vpc-resource-controller-k8s/test/framework/resource/k8s/controller"
	testUtils "github.com/aws/amazon-vpc-resource-controller-k8s/test/framework/utils"
	"github.com/aws/aws-sdk-go/aws"
	"github.com/google/uuid"
	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	appsV1 "k8s.io/api/apps/v1"
	batchV1 "k8s.io/api/batch/v1"
	coreV1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// TODO: Add similar testing for Security Group for Pods

// Tests intended to try to break the controller by exercising the system
// under abnormal conditions and above-average Windows container workloads.
// See test/README.md for more details.
var _ = Describe("Windows Integration Stress Tests", func() {
	var (
		namespace string
		// targetedNodeLabels to target the same set of nodes for creating
		// jobs and deployments
		targetedNodeLabels map[string]string
		// number of nodes on which the test will be executed
		testNodeCount int
		// the server pod serving http requests to the client pods
		serverPod *coreV1.Pod
		// the number of retries that each client will perform against the server Pod;
		// if any of the retries fails, the Pod will error out indicating issues
		// with networking
		clientToServerTries         int
		clientToServerRetryInterval int
		clientDeployment            *appsV1.Deployment
		clientJob                   *batchV1.Job
		// label key-vals for the deployments and Jobs; these are used to get the Pod
		// objects belonging to the deployment/job
		podRoleLabelKey             string
		clientDeploymentPodLabelVal string
		clientJobPodLabelVal        string
		// the maximum number of Pods that can be created on the set of targeted nodes
		maxPodCount int
		// The resource limit which will be injected into the Pod; this is added by
		// the WebHook running in the controller Pod. However, since we run
		// scenarios where we kill both controllers (for simplicity's sake),
		// we manually inject the resource limit for such scenarios.
		// With an HA setup in Prod, the running controller will inject the limits.
		resourceLimits map[coreV1.ResourceName]resource.Quantity
		err            error
	)

	BeforeEach(func() {
		// For each test, we create a label with a unique ID for the targeted nodes
		// and create the test jobs/deployments on these targeted nodes
		targetedNodeLabels = map[string]string{
			"test-node-selector-id": uuid.New().String(),
		}
		maxPodCount = 0
		namespace = "windows-test"
		clientToServerTries = 5
		clientToServerRetryInterval = 2
		testNodeCount = 3
		podRoleLabelKey = "app"
		clientDeploymentPodLabelVal = "windows-deployment"
		clientJobPodLabelVal = "windows-job"
		// For ideal cases where we are not killing both controllers, pass an empty map
		resourceLimits = map[coreV1.ResourceName]resource.Quantity{}
	})

	JustBeforeEach(func() {
		Expect(len(windowsNodeList.Items)).To(BeNumerically(">=", testNodeCount))

		for i := 0; i < testNodeCount; i++ {
			node := windowsNodeList.Items[i]
			capacity, found := node.Status.Allocatable[config.ResourceNameIPAddress]
			Expect(found).To(BeTrue())
			maxPodCount += int(capacity.Value())
		}

		By("adding labels to selected nodes for testing")
		err = frameWork.NodeManager.AddLabels(windowsNodeList.Items[:testNodeCount], targetedNodeLabels)
		Expect(err).ToNot(HaveOccurred())

		By("creating the namespace")
		err = frameWork.NSManager.CreateNamespace(ctx, namespace)
		Expect(err).ToNot(HaveOccurred())

		serverContainer := manifest.NewWindowsContainerBuilder().
			Args([]string{GetCommandToStartHttpServer()}).
			Build()

		By("creating and waiting till the Windows server Pod runs")
		serverPod, err = manifest.NewWindowsPodBuilder().
			TerminationGracePeriod(0).
			Namespace(namespace).
			Container(serverContainer).
			Name("server").
			Build()
		Expect(err).ToNot(HaveOccurred())

		serverPod, err = frameWork.PodManager.
			CreateAndWaitTillPodIsRunning(ctx, serverPod, testUtils.WindowsPodsCreationTimeout)
		Expect(err).ToNot(HaveOccurred())

		clientContainer := manifest.NewWindowsContainerBuilder().
			Args([]string{GetCommandToContinuouslyTestHostConnectivity(
				serverPod.Status.PodIP, clientToServerTries, clientToServerRetryInterval)}).
			Resources(coreV1.ResourceRequirements{
				Limits:   resourceLimits,
				Requests: resourceLimits,
			}).Build()

		// Append the Node Label Selector to the Windows Pod; this is required
		// for the WebHook to inject limits into the Pod
		nodeSelector := utils.CopyMap(targetedNodeLabels)
		nodeSelector[config.NodeLabelOS] = config.OSWindows

		// -1 because the server Pod may be running on one of the test Nodes
		maxPodCount = maxPodCount - 1

		clientJob = manifest.NewWindowsJob().
			PodLabels("app", "windows-client").
			Parallelism(maxPodCount/2).
			Namespace(namespace).
			Name("client-job").
			PodLabels(podRoleLabelKey, clientJobPodLabelVal).
			Container(clientContainer).
			TerminationGracePeriod(0).
			NodeSelector(nodeSelector).
			Build()

		By("creating the client deployment configuration")
		clientDeployment = manifest.NewWindowsDeploymentBuilder().
			Namespace(namespace).
			Replicas(maxPodCount/2).
			TerminationGracePeriod(0).
			PodLabel(podRoleLabelKey, clientDeploymentPodLabelVal).
			Container(clientContainer).
			Name("client-deployment").
			NodeSelector(nodeSelector).
			Build()
	})

	JustAfterEach(func() {
		By("removing labels on selected nodes for testing")
		err = frameWork.NodeManager.RemoveLabels(windowsNodeList.Items[:testNodeCount], targetedNodeLabels)
		Expect(err).ToNot(HaveOccurred())

		By("deleting the namespace")
		err = frameWork.NSManager.DeleteAndWaitTillNamespaceDeleted(ctx, namespace)
		Expect(err).ToNot(HaveOccurred())
	})

	// Negative test to verify that the positive tests are working as intended
	Describe("when connecting to a non-existent server", func() {
		It("all jobs should fail", func() {
			By("deleting the server Pod")
			err = frameWork.PodManager.DeleteAndWaitTillPodIsDeleted(ctx, serverPod)
			Expect(err).ToNot(HaveOccurred())

			By("creating a job to connect to the server and verifying it fails")
			_, err = frameWork.JobManager.CreateAndWaitForJobToComplete(ctx, clientJob)
			Expect(err).To(HaveOccurred())
		})
	})

	// This set of tests has the following pattern:
	// 1. Create Jobs on the given Nodes for a fixed interval. These are long-running
	//    jobs that continuously test network connectivity to a Windows server. A Job
	//    fails if the connection to the server fails. The purpose of the Jobs is to
	//    verify that existing Pod networking is not removed while constant operations
	//    are being performed on the same set of Nodes.
	// 2. Create short-lived deployment Pods on the same set of Nodes. The purpose of
	//    the deployment is to create continuous operations on the Nodes in the hope
	//    of catching a regression that removes networking of the long-lived Jobs.
	// 3. [Optional] Restart the Controller Deployment either forcefully or gracefully
	//    while the test jobs are running and deployments are being scaled up and down.
	Describe("when deployment/jobs are continuously scaled up and down", func() {
		// testFinishedChan is notified when the main test completes, requesting
		// the parallel routines to stop
		var testFinishedChan chan struct{}

		BeforeEach(func() {
			clientToServerTries = 100 // test the client->server request 100 times
			testFinishedChan = make(chan struct{})
		})

		JustBeforeEach(func() {
			// Create and delete deployments at random intervals while the long-running
			// test runs in the It block
			go CreateAndDeleteDeploymentAtRandomInterval(30, clientDeployment, podRoleLabelKey,
				clientDeploymentPodLabelVal, testFinishedChan)
		})

		JustAfterEach(func() {
			// Signal the channel to stop the continuous deployment creation/deletion
			testFinishedChan <- struct{}{}
		})

		Context("[LOCAL] when controller is restarted when deployments are being created", func() {
			// restartForcefully, when set, kills the controller Pods forcefully without
			// respecting the grace period
			var restartForcefully bool

			BeforeEach(func() {
				restartForcefully = false
				resourceLimits = map[coreV1.ResourceName]resource.Quantity{
					config.ResourceNameIPAddress: resource.MustParse("1"),
				}
			})

			JustBeforeEach(func() {
				go RestartControllerAtRandomIntervals(30, restartForcefully, testFinishedChan)
			})

			JustAfterEach(func() {
				testFinishedChan <- struct{}{}
			})

			Context("when controller is gracefully restarted", func() {
				It("all Pods should run without error", func() {
					err = CreateJobAndWaitTillCompleted(clientJob, 1)
					Expect(err).ToNot(HaveOccurred())
				})
			})

			Context("when controller is forcefully restarted in between", func() {
				BeforeEach(func() {
					restartForcefully = true
				})

				It("all Pods should run without error", func() {
					err = CreateJobAndWaitTillCompleted(clientJob, 1)
					Expect(err).ToNot(HaveOccurred())
				})
			})
		})

		Context("when controller is not restarted in between", func() {
			It("all Pods should run without error", func() {
				err = CreateJobAndWaitTillCompleted(clientJob, 2)
				Expect(err).ToNot(HaveOccurred())
			})
		})
	})

	Describe("when user manually un-assigns secondary IPv4 addresses", func() {
		BeforeEach(func() {
			testNodeCount = 1
			clientToServerTries = 15
		})

		It("controller should auto recover", func() {
			By("creating a deployment")
			deployment := clientDeployment.DeepCopy()

			By("getting the instance ID of the test Node")
			nodeList := windowsNodeList.Items[:testNodeCount]
			testNode := nodeList[0]
			instanceID := manager.GetNodeInstanceID(&testNode)
			Expect(instanceID).ToNot(Equal(""))

			By("creating a windows deployment")
			deployment, err := frameWork.DeploymentManager.
				CreateAndWaitUntilDeploymentReady(ctx, deployment)
			Expect(err).ToNot(HaveOccurred())

			By("getting the pods of the deployment")
			pods, err := frameWork.PodManager.
				GetPodsWithLabel(ctx, namespace, podRoleLabelKey, clientDeploymentPodLabelVal)
			Expect(err).ToNot(HaveOccurred())

			By("removing secondary IPv4 addresses that are in use by Pods")
			numIPToRemove := 5
			Expect(len(pods)).To(BeNumerically(">=", numIPToRemove))
			var ipsToRemove []string
			for i := 0; i < numIPToRemove; i++ {
				ipsToRemove = append(ipsToRemove, pods[i].Status.PodIP)
			}
			err = frameWork.EC2Manager.UnAssignSecondaryIPv4Address(instanceID, ipsToRemove)
			Expect(err).ToNot(HaveOccurred())

			By("deleting the deployment")
			err = frameWork.DeploymentManager.DeleteAndWaitUntilDeploymentDeleted(ctx, deployment)
			Expect(err).ToNot(HaveOccurred())

			By("creating a job that tests networking")
			clientJob.Spec.Parallelism = aws.Int32(int32(maxPodCount))
			_, err = frameWork.JobManager.CreateAndWaitForJobToComplete(ctx, clientJob)
			Expect(err).ToNot(HaveOccurred())
		})
	})

	Describe("[LOCAL] when the RBAC policy of the Service Account is modified by the user", func() {
		BeforeEach(func() {
			resourceLimits = map[coreV1.ResourceName]resource.Quantity{
				config.ResourceNameIPAddress: resource.MustParse("1"),
			}
			// Enough retries so the Job keeps running until the controller restarts
			clientToServerTries = 100
		})

		JustAfterEach(func() {
			controller.RevertClusterRoleChanges(frameWork.RBACManager)
		})

		It("controller should not impact existing Pods", func() {
			By("creating a job and waiting for it to run")
			_, err = frameWork.JobManager.CreateJobAndWaitForJobToRun(ctx, clientJob)
			Expect(err).ToNot(HaveOccurred())

			controller.PatchClusterRole(frameWork.RBACManager, "pods", []string{"get", "watch", "patch"})
			controller.RestartController(ctx, frameWork.ControllerManager, frameWork.DeploymentManager)

			// If the networking for the Pods had been removed, the Job would fail
			verify.ExpectPodHaveDesiredPhase(namespace, podRoleLabelKey, clientJobPodLabelVal,
				[]coreV1.PodPhase{coreV1.PodSucceeded, coreV1.PodRunning})
		})
	})
})
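
// Note on coordination: RestartControllerAtRandomIntervals and
// CreateAndDeleteDeploymentAtRandomInterval are started as goroutines from the
// JustBeforeEach blocks above and keep generating load until they receive a value
// on the shared done channel (testFinishedChan); each matching JustAfterEach sends
// one value, so every goroutine started for a test eventually stops. GinkgoRecover
// is deferred inside these goroutines so that a failed assertion is reported to
// Ginkgo instead of crashing the test process.
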
// RestartControllerAtRandomIntervals restarts the controller at random intervals; if the force
// option is set, the controller Pods are restarted forcefully.
func RestartControllerAtRandomIntervals(interval int, force bool, done chan struct{}) {
	defer GinkgoRecover() // Since we are running inside a goroutine

	By("starting continuous controller restarts")
	for {
		// wait between 1 and interval seconds before the next restart
		sleepDuration := rand.Intn(interval)
		ticker := time.NewTicker(time.Duration(1+sleepDuration) * time.Second)
		select {
		case <-ticker.C:
			if force {
				By("restarting the controller forcefully")
				leader := controller.ForcefullyKillControllerPods(ctx, frameWork.ControllerManager,
					frameWork.PodManager)
				Expect(err).ToNot(HaveOccurred())
				By(fmt.Sprintf("new leader pod is %s", leader))
			} else {
				By("restarting the controller with grace period")
				leader := controller.RestartController(ctx, frameWork.ControllerManager,
					frameWork.DeploymentManager)
				By(fmt.Sprintf("new leader pod is %s", leader))
			}
		case <-done:
			By("stopping continuous controller restarts")
			return
		}
	}
}

// CreateAndDeleteDeploymentAtRandomInterval repeatedly creates the given deployment, verifies that
// its Pods get IPv4 addresses and reach the desired phase, and then deletes it, waiting a random
// interval between iterations, until it receives a signal on done.
func CreateAndDeleteDeploymentAtRandomInterval(maxInterval int, deployment *appsV1.Deployment,
	podLabelKey string, podLabelVal string, done chan struct{}) {
	defer GinkgoRecover() // Since we are running inside a goroutine

	By("starting continuous deployment creation and deletion")
	for {
		// wait between 1 and maxInterval seconds before the next iteration
		sleepDuration := rand.Intn(maxInterval)
		ticker := time.NewTicker(time.Duration(1+sleepDuration) * time.Second)
		select {
		case <-ticker.C:
			newDeployment := deployment.DeepCopy()
			newDeployment, err = frameWork.DeploymentManager.CreateAndWaitUntilDeploymentReady(ctx, newDeployment)
			Expect(err).ToNot(HaveOccurred())

			verify.WindowsPodsHaveIPv4Address(deployment.Namespace, podLabelKey, podLabelVal)
			verify.ExpectPodHaveDesiredPhase(deployment.Namespace, podLabelKey, podLabelVal,
				[]coreV1.PodPhase{coreV1.PodRunning, coreV1.PodSucceeded})

			err = frameWork.DeploymentManager.DeleteAndWaitUntilDeploymentDeleted(ctx, newDeployment)
			Expect(err).ToNot(HaveOccurred())
		case <-done:
			By("stopping continuous deployment creation and deletion")
			return
		}
	}
}

// CreateJobAndWaitTillCompleted creates a copy of the client job the given number of times and
// waits for each run to complete, returning the first error encountered.
func CreateJobAndWaitTillCompleted(clientJob *batchV1.Job, repetition int) error {
	By("creating jobs that continuously test networking")
	for i := 0; i < repetition; i++ {
		By("creating the client job run " + strconv.Itoa(i))
		job := clientJob.DeepCopy()
		job.Name = fmt.Sprintf("client-job-%d", i)
		_, err := frameWork.JobManager.CreateAndWaitForJobToComplete(ctx, job)
		if err != nil {
			return err
		}
	}
	return nil
}