# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

locals {
  kinesis_scaling_function_name          = "kinesis-scaling"
  kinesis_period_mins                    = 5                               # This value is used here and in stream.tf alarms
  kinesis_period_secs                    = 60 * local.kinesis_period_mins  # This value is used here and in stream.tf alarms
  kinesis_scale_up_threshold             = 0.75                            # This value is used here and in stream.tf alarms
  kinesis_scale_up_evaluation_period     = 25 / local.kinesis_period_mins  # This value is used here and in stream.tf alarms
  kinesis_scale_up_datapoints_required   = 25 / local.kinesis_period_mins  # This value is used here and in stream.tf alarms
  kinesis_scale_down_threshold           = 0.25                            # This value is used here and in stream.tf alarms
  kinesis_scale_down_evaluation_period   = 300 / local.kinesis_period_mins # This value is used here and in stream.tf alarms
  kinesis_scale_down_datapoints_required = 285 / local.kinesis_period_mins # This value is used here and in stream.tf alarms
  kinesis_scale_down_min_iter_age_mins   = 30                              # This value is used here and in stream.tf alarms
  kinesis_fatal_error_metric_name        = "FATAL_ERROR_KINESIS_SCALING"   # This value is used here and in stream.tf alarms

  # Note: There must always be at least a 2 datapoint difference between the scale-up and scale-down datapoints.
  # >>> Scale-up requires 5 out of 5 data points (consecutive) ... 25/5 = _5_
  # >>> Scale-down requires 57 out of 60 data points (non-consecutive) ... 285/5 = 57 and 300/5 = 60, 60-57 = _3_
  # The 2 datapoint difference is the gap between _5_ (consecutive datapoints required to scale up) and _3_ (the 60-57
  # datapoints the scale-down alarm is allowed to miss). The reasoning for this difference is that you should never be
  # able to trigger a scale-down immediately after a scale-up. Say your throughput spiked just enough to scale up, then
  # went away. Because a scale-up requires more than 3 consecutive datapoints above the scale-up threshold, it is
  # impossible to get 57 of 60 datapoints below the scale-down threshold until you wait 300 minutes and those scale-up
  # datapoints age out of the scale-down window.
  # OK, now the really confusing part: the reason a 1 datapoint difference isn't enough is that we have 2 alarms, and
  # they can operate on slightly different time alignments, internally, when calculating their datapoints, despite
  # always showing datapoints at the 5 minute marks (12:05, 12:10, etc.) in the graph. So you add 1 extra datapoint of
  # difference, for a total of 2, to guard against this potential misalignment between the scale-up and scale-down
  # alarms. You can see this internal difference by going to the alarm, looking at its history for an entry like
  # "Alarm updated from OK to In alarm", and clicking the time "2020-06-23 12:47:09" link on the row. The "newState"
  # section will show the datapoints and their times:
  # > "stateReason": "Threshold Crossed: 5 out of the last 5 datapoints [0.9162957064310709 (23/06/20 12:42:00), 0.934598798751831 (23/06/20 12:37:00...
  # As you can see, these datapoints are not aligned to the 5 minute display boundaries (12:05, 12:10) but rather to 12:37, 12:42.

  kinesis_consumer_lambda_name       = "kinesis-consumer-lambda"
  kinesis_consumer_lambda_arn        = "arn:aws:lambda:${local.region}:${local.account_id}:function:${local.kinesis_consumer_lambda_name}"
  kinesis_consumer_lambdas_per_shard = 5 # Note: Max is 10; you can max it out if a stream can't catch up.
  # Remove the kinesis_consumer_lambda ignore block for reserved_concurrent_executions if you change this value
  # or it won't apply when you deploy (see the commented sketch just after this locals block).
}
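
# The "ignore block" mentioned at the end of the locals above is assumed to live on the consumer Lambda's resource
# (defined outside this file). A minimal, commented-out sketch of what it presumably looks like is below; the resource
# labels and the "..." are illustrative, not copied from this project. With ignore_changes in place, Terraform will
# not overwrite reserved_concurrent_executions, which is why the lifecycle block has to be removed for a new
# kinesis_consumer_lambdas_per_shard value to be applied on deploy.
#
#   resource "aws_lambda_function" "kinesis_consumer_lambda" {
#     # ... consumer Lambda arguments ...
#
#     lifecycle {
#       # Presumably ignored so the scaling function can adjust the consumer's concurrency
#       # (shards * PROCESSING_LAMBDAS_PER_SHARD) without Terraform reverting it on the next apply.
#       ignore_changes = [reserved_concurrent_executions]
#     }
#   }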
##################################
# Kinesis Auto-Scaling Lambda
# To create the source_file for the golang lambda function, run the command on the next line from the project ROOT.
# GOOS=linux go build -o main golang/scale.go
##################################

data "archive_file" "kinesis_scaling_function_zip" {
  type        = "zip"
  source_file = "../main"
  output_path = "../kinesis_scaling.zip"
}

resource "aws_lambda_function" "kinesis_scaling_function" {
  filename                       = data.archive_file.kinesis_scaling_function_zip.output_path
  function_name                  = local.kinesis_scaling_function_name
  handler                        = "main"
  role                           = aws_iam_role.kinesis_scaling_lambda_role.arn
  runtime                        = "go1.x"
  source_code_hash               = data.archive_file.kinesis_scaling_function_zip.output_base64sha256
  timeout                        = 900
  memory_size                    = 512
  reserved_concurrent_executions = 1

  environment {
    variables = {
      SCALE_PERIOD_MINS              = local.kinesis_period_mins
      SCALE_UP_THRESHOLD             = local.kinesis_scale_up_threshold
      SCALE_UP_EVALUATION_PERIOD     = local.kinesis_scale_up_evaluation_period
      SCALE_UP_DATAPOINTS_REQUIRED   = local.kinesis_scale_up_datapoints_required
      SCALE_DOWN_THRESHOLD           = local.kinesis_scale_down_threshold
      SCALE_DOWN_EVALUATION_PERIOD   = local.kinesis_scale_down_evaluation_period
      SCALE_DOWN_DATAPOINTS_REQUIRED = local.kinesis_scale_down_datapoints_required
      SCALE_DOWN_MIN_ITER_AGE_MINS   = local.kinesis_scale_down_min_iter_age_mins
      PROCESSING_LAMBDA_ARN          = local.kinesis_consumer_lambda_arn
      PROCESSING_LAMBDAS_PER_SHARD   = local.kinesis_consumer_lambdas_per_shard
      THROTTLE_RETRY_MIN_SLEEP       = 1
      THROTTLE_RETRY_MAX_SLEEP       = 3
      THROTTLE_RETRY_COUNT           = 30
      DRY_RUN                        = "false"
    }
  }
}

resource "aws_lambda_function_event_invoke_config" "kinesis_scaling_function_async_config" {
  function_name          = aws_lambda_function.kinesis_scaling_function.function_name
  maximum_retry_attempts = 0 # We do not want any retries of the scaling function if it errors out, alarms will re-trigger it
}

resource "aws_cloudwatch_metric_alarm" "kinesis_scaling_fatal_errors" {
  alarm_name                = "${local.kinesis_scaling_function_name}-fatal-errors"
  comparison_operator       = "GreaterThanThreshold"
  evaluation_periods        = "1"
  metric_name               = local.kinesis_fatal_error_metric_name
  namespace                 = "AWS/Lambda"
  period                    = "60"
  statistic                 = "Average"
  threshold                 = "0"
  alarm_description         = "This metric monitors fatal errors in the kinesis scaling lambda"
  insufficient_data_actions = []

  dimensions = {
    FunctionName = aws_lambda_function.kinesis_scaling_function.function_name
  }
}
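
# The fatal-errors alarm above defines no alarm_actions, so a fatal scaling error only changes the alarm state.
# If you want to be notified, one option is to route the alarm to an SNS topic, as in the commented-out sketch
# below. The topic name is hypothetical and is not defined anywhere in this project.
#
#   resource "aws_sns_topic" "kinesis_scaling_alerts" {
#     name = "kinesis-scaling-alerts"
#   }
#
#   # Then add to aws_cloudwatch_metric_alarm.kinesis_scaling_fatal_errors:
#   #   alarm_actions = [aws_sns_topic.kinesis_scaling_alerts.arn]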