# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

AWSTemplateFormatVersion: '2010-09-09'
Description: "Cloudformation Stack template to create the Cloudwatch dashboard for the Kinesis Analytics Flink Application"

Parameters:
  # Substituted into the "Application" dimension of every AWS/KinesisAnalytics
  # metric and into the log group name queried by the log widget.
  ApplicationName:
    Description: "The Kinesis Analytics application to monitor the cloudwatch metrics in dashboard"
    Type: "String"
  # Substituted into the "region" property of every widget.
  KDARegion:
    Description: "The region where the Kinesis Analytics application is created"
    Type: "String"
    Default: "us-east-1"
  # Substituted into the "StreamName" dimension of the AWS/Kinesis and
  # KinesisProducerLibrary widgets; a "-" to "_" rewritten copy is used as the
  # "Id" dimension of the millisBehindLatest widget.
  KinesisStreamName:
    Description: "The Kinesis stream name which is configured as input in Kinesis Analytics application"
    Type: "String"
    Default: "None"
  DashboardName:
    Description: "The CloudWatch dashboard name to create"
    Type: "String"
    Default: "KDAAdvancedMonitoringDashboard"

Resources:
  CloudwatchDashboard:
    Type: "AWS::CloudWatch::Dashboard"
    Properties:
      DashboardName: !Ref DashboardName
      # Dashboard body is JSON rendered through Fn::Sub (list form): the first
      # element is the template string, the second supplies the extra variable
      # KinesisStreamNameUnderscore. Comments cannot be placed inside the
      # literal block (they would become part of the JSON), so the layout is
      # summarized here. Widgets are listed one per line, in no particular
      # visual order; their "x"/"y" values place them in these sections:
      #   y=0      text header "Application Health"
      #   y=1-12   downtime, uptime, restarts, checkpoint metrics
      #   y=13     log table for /aws/kinesis-analytics/${ApplicationName}
      #   y=19     text header "Resource Utilization"
      #   y=20-31  CPU, heap, old-generation GC, thread count
      #   y=32     text header "Flink Application Progress"
      #   y=33-44  record rates, late records, watermarks, event-time latency
      #   y=45     text header "Kinesis Source and Sink Metrics"
      #   y=46-57  millisBehindLatest, KinesisProducerLibrary and AWS/Kinesis metrics
      DashboardBody: !Sub
        - |
          {"widgets":[
          {"type":"metric","x":6,"y":1,"width":6,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","downtime","Application","${ApplicationName}"]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"Downtime","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}},"annotations":{"horizontal":[{"label":"DowntimeThreshold","value":0}]}}},
          {"type":"metric","x":0,"y":1,"width":6,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","uptime","Application","${ApplicationName}",{"label":"uptime"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Minimum","title":"Uptime","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":18,"y":1,"width":6,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","fullRestarts","Application","${ApplicationName}",{"label":"fullRestarts"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"FlinkJobRestarts","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":12,"y":1,"width":6,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","numberOfFailedCheckpoints","Application","${ApplicationName}",{"label":"numberOfFailedCheckpoints"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"NumberofFailedCheckpoints","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}},"annotations":{"horizontal":[{"label":"NumberofFailedCheckpointsThreshold","value":0}]}}},
          {"type":"metric","x":12,"y":7,"width":12,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","lastCheckpointDuration","Application","${ApplicationName}"]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"LastCheckpointDuration","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":0,"y":7,"width":12,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","lastCheckpointSize","Application","${ApplicationName}",{"label":"lastCheckpointSize"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Average","title":"LastCheckpointSize","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":0,"y":20,"width":12,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","cpuUtilization","Application","${ApplicationName}",{"label":"cpuUtilization"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"CPUUtilization","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}},"annotations":{"horizontal":[{"label":"CPUUtilizationThreshold","value":80}]}}},
          {"type":"metric","x":12,"y":20,"width":12,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","heapMemoryUtilization","Application","${ApplicationName}",{"label":"heapMemoryUtilization"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"HeapMemoryUtilization","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}},"annotations":{"horizontal":[{"label":"HeapMemoryUtilizationThreshold","value":90}]}}},
          {"type":"metric","x":15,"y":26,"width":9,"height":6,"properties":{"metrics":[[{"expression":"(m1*100)/60000","label":"OldGenerationGCTimePercent","id":"e1","region":"${KDARegion}"}],["AWS/KinesisAnalytics","oldGenerationGCTime","Application","${ApplicationName}",{"label":"oldGenerationGCTime","id":"m1","visible":false}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"OldGenerationGCPercent(Over1Min)","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}},"annotations":{"horizontal":[{"label":"GCPercentThreshold","value":60}]}}},
          {"type":"metric","x":0,"y":26,"width":9,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","threadsCount","Application","${ApplicationName}",{"label":"threadsCount"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"ThreadCount","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":9,"y":26,"width":6,"height":6,"properties":{"metrics":[[{"expression":"RATE(METRICS())*60","label":"OldGenerationGCCountRate","id":"e1"}],["AWS/KinesisAnalytics","oldGenerationGCCount","Application","${ApplicationName}",{"label":"","id":"m1","visible":false}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"OldGenerationGCCountRate","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":0,"y":33,"width":24,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","numRecordsInPerSecond","Application","${ApplicationName}",{"label":"numRecordsInPerSecond"}],[".","numRecordsOutPerSecond",".",".",{"yAxis":"left"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Average","title":"IncomingandOutgoingRecords(PerSecond)","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":18,"y":39,"width":6,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","numLateRecordsDropped","Application","${ApplicationName}",{"label":"numLateRecordsDropped"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"LateRecordsDropped","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":0,"y":39,"width":12,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","currentInputWatermark","Application","${ApplicationName}",{"label":"currentInputWatermark"}],[".","currentOutputWatermark",".","."]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Minimum","title":"InputWatermarkandOutputWatermark","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":12,"y":39,"width":6,"height":6,"properties":{"metrics":[[{"expression":"m1-m2","label":"EventTimeLatency","id":"e1"}],["AWS/KinesisAnalytics","currentOutputWatermark","Application","${ApplicationName}",{"label":"currentOutputWatermark","id":"m1","visible":false}],[".","currentInputWatermark",".",".",{"label":"currentInputWatermark","id":"m2","visible":false}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Minimum","title":"EventTimeLatency","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}}}},
          {"type":"metric","x":0,"y":46,"width":6,"height":6,"properties":{"metrics":[["AWS/KinesisAnalytics","millisBehindLatest","Id","${KinesisStreamNameUnderscore}","Application","${ApplicationName}","Flow","Input",{"label":"millisBehindLatest"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","period":60,"stat":"Maximum","title":"KinesisMillisBehindLatest","yAxis":{"left":{"showUnits":false},"right":{"showUnits":false}},"annotations":{"horizontal":[{"label":"MillisBehindLatestThreshold","value":60000}]}}},
          {"type":"text","x":0,"y":45,"width":24,"height":1,"properties":{"markdown":"\n# **Kinesis Source and Sink Metrics**\n"}},
          {"type":"text","x":0,"y":0,"width":24,"height":1,"properties":{"markdown":"\n# **Application Health**\n"}},
          {"type":"text","x":0,"y":19,"width":24,"height":1,"properties":{"markdown":"\n# **Resource Utilization**\n"}},
          {"type":"text","x":0,"y":32,"width":24,"height":1,"properties":{"markdown":"\n# **Flink Application Progress**\n"}},
          {"type":"metric","x":6,"y":46,"width":6,"height":6,"properties":{"view":"timeSeries","stacked":false,"metrics":[["KinesisProducerLibrary","RetriesPerRecord","StreamName","${KinesisStreamName}"]],"region":"${KDARegion}"}},
          {"type":"metric","x":12,"y":46,"width":6,"height":6,"properties":{"view":"timeSeries","stacked":false,"metrics":[["KinesisProducerLibrary","UserRecordsPending","StreamName","${KinesisStreamName}"]],"region":"${KDARegion}"}},
          {"type":"metric","x":18,"y":46,"width":6,"height":6,"properties":{"view":"timeSeries","stacked":false,"metrics":[["KinesisProducerLibrary","BufferingTime","StreamName","${KinesisStreamName}"]],"region":"${KDARegion}"}},
          {"type":"metric","x":0,"y":52,"width":12,"height":6,"properties":{"metrics":[["AWS/Kinesis","IncomingRecords","StreamName","${KinesisStreamName}"],[".","GetRecords.IteratorAgeMilliseconds",".",".",{"stat":"Maximum","yAxis":"right"}]],"view":"timeSeries","stacked":false,"region":"${KDARegion}","stat":"Sum","period":60}},
          {"type":"metric","x":12,"y":52,"width":12,"height":6,"properties":{"view":"timeSeries","stacked":false,"metrics":[["AWS/Kinesis","ReadProvisionedThroughputExceeded","StreamName","${KinesisStreamName}"],[".","WriteProvisionedThroughputExceeded",".","."]],"region":"${KDARegion}"}},
          {"type":"log","x":0,"y":13,"width":24,"height":6,"properties":{"query":"SOURCE \"/aws/kinesis-analytics/${ApplicationName}\"| fields @timestamp, @message\n | sort @timestamp desc\n |limit 20","region":"${KDARegion}","stacked":false,"view":"table"}}
          ]}
        # Stream name with every "-" replaced by "_"; referenced above as the
        # "Id" dimension of the millisBehindLatest metric.
        - { KinesisStreamNameUnderscore: !Join ["_", !Split ["-", !Ref KinesisStreamName]] }

Outputs:
  CloudwatchDashboard:
    Description: "Dashboard created to monitor the Kinesis Analytics Flink Application"
    # Plain quoted scalar (not a "|" literal block) so the exported value is the
    # bare URL, without embedded double quotes or a trailing newline.
    Value: !Sub "https://${AWS::Region}.console.aws.amazon.com/cloudwatch/home#dashboards:name=${DashboardName}"