#!/bin/bash
#
# Publishes node and Trino metrics for an EMR cluster to CloudWatch, and to
# Ganglia when /etc/ganglia/gmond.conf is present.
#
# The script does nothing unless /var/log/trino/server.log exists. On the
# master node it queries Trino's JMX catalog through trino-cli and publishes
# query/queue statistics plus CPU utilization; on every other node it
# publishes CPU utilization only.
#
# Inputs read from disk:
#   /tmp/aws-region                      AWS region passed to the AWS CLI
#   /mnt/var/lib/info/instance.json      isMaster flag
#   /mnt/var/lib/info/job-flow.json      jobFlowId (CloudWatch dimension)
# State kept between runs:
#   /tmp/trino_stats_query_count         stats queries this script has fired
#   /tmp/trino_avg_query_time            previous 5-minute average query time
set -x -e -o pipefail

readonly REGION_FILE=/tmp/aws-region
readonly INSTANCE_JSON=/mnt/var/lib/info/instance.json
readonly JOB_FLOW_JSON=/mnt/var/lib/info/job-flow.json
readonly TRINO_SERVER_LOG=/var/log/trino/server.log
readonly GMOND_CONF=/etc/ganglia/gmond.conf
readonly STATS_QUERY_COUNT_FILE=/tmp/trino_stats_query_count
readonly AVG_QUERY_TIME_FILE=/tmp/trino_avg_query_time
readonly CW_NAMESPACE=AWS/ElasticMapReduce
# Each run fires two trino-cli queries (query manager stats + node count).
readonly STATS_QUERIES_PER_RUN=2

# Set by main before any metric is published.
REGION=
CLUSTER_ID=

#######################################
# Succeeds when $1 is a plain decimal number (optional sign, optional
# fraction, optional exponent). Rejects empty strings and text such as "NaN".
#######################################
is_number() {
  local re='^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
  [[ $1 =~ $re ]]
}

#######################################
# Prints the percentage change from a previous value to the current one.
# Arguments: $1 - current value, $2 - previous value
# Outputs:   the change with two decimals, or 0 when the previous value is
#            numerically zero (including "0.0") or either value is not a
#            number, so the division is never attempted on bad input.
#######################################
percent_change() {
  local current=$1 previous=$2
  if ! is_number "$current" || ! is_number "$previous"; then
    echo 0
    return 0
  fi
  # awk (unlike bc) accepts exponent notation; LC_ALL=C pins the decimal point.
  LC_ALL=C awk -v cur="$current" -v prev="$previous" 'BEGIN {
    if (prev + 0 == 0) {
      print 0
    } else {
      printf "%.2f\n", 100 * (cur - prev) / prev
    }
  }'
}

#######################################
# Reads one vmstat data line on stdin and prints 100 minus its 15th
# whitespace-separated field (the "id" column in vmstat's default layout).
# Splitting on whitespace runs keeps the column position stable whether or
# not the line starts with a blank.
#######################################
cpu_util_from_vmstat() {
  awk '{ print 100 - $15 }'
}

#######################################
# Prints the CPU utilization percent taken from the second (last) vmstat
# sample of a one-second interval.
#######################################
cpu_util_percent() {
  vmstat -n 1 2 | tail -n 1 | cpu_util_from_vmstat
}

#######################################
# Removes this script's own stats queries from Trino's completed-query count.
# Arguments: $1 - completed count reported by Trino
#            $2 - stats-query count persisted by the previous run
# Outputs:   "<adjusted completed count> <stats-query count to persist>"
# When the reported count is smaller than the number of stats queries we
# believe we have fired, the persisted count is stale (presumably the Trino
# counters were reset by a restart - confirm), so it is restarted at one
# run's worth and the adjusted count is clamped at zero instead of going
# negative.
#######################################
adjust_completed_count() {
  local reported=$1 prior=$2
  local fired=$((prior + STATS_QUERIES_PER_RUN))
  if ((reported < fired)); then
    fired=$STATS_QUERIES_PER_RUN
  fi
  local adjusted=$((reported - fired))
  if ((adjusted < 0)); then
    adjusted=0
  fi
  printf '%s %s\n' "$adjusted" "$fired"
}

#######################################
# Runs one SQL statement against the jmx catalog and prints the TSV result.
# Arguments: $1 - SQL text
#######################################
trino_jmx_query() {
  trino-cli --execute "$1" --catalog jmx --schema current --output-format TSV
}

#######################################
# Publishes one data point to CloudWatch for this cluster.
# Globals:   REGION, CLUSTER_ID (read)
# Arguments: $1 - metric name, $2 - unit, $3 - value
#######################################
put_cloudwatch_metric() {
  local name=$1 unit=$2 value=$3
  aws cloudwatch put-metric-data \
    --metric-name "$name" \
    --namespace "$CW_NAMESPACE" \
    --unit "$unit" \
    --value "$value" \
    --dimensions "JobFlowId=${CLUSTER_ID}" \
    --region "$REGION"
}

#######################################
# Publishes one data point to Ganglia in the "trino" group.
# Arguments: $1 - metric name, $2 - value, $3 - gmetric type, $4 - unit label
#######################################
put_ganglia_metric() {
  local name=$1 value=$2 type=$3 unit=$4
  /usr/bin/gmetric -c "$GMOND_CONF" --group trino \
    --name "$name" --value "$value" --type "$type" --unit "$unit"
}

#######################################
# Collects Trino statistics on the master node and publishes them together
# with the master CPU utilization.
# Arguments: $1 - CPU utilization percent
#######################################
publish_master_metrics() {
  local cpu_util=$1

  local stats_row
  stats_row=$(trino_jmx_query 'select "abandonedqueries.totalcount", "canceledqueries.totalcount", "completedqueries.totalcount", "executiontime.fiveminutes.avg", "failedqueries.totalcount", "queuedqueries", "queuedtime.fiveminutes.avg", "runningqueries" from "trino.execution:name=querymanager"')
  # Only the first result line is meaningful; split it on tabs.
  stats_row=${stats_row%%$'\n'*}
  local -a fields
  mapfile -t fields < <(tr '\t' '\n' <<<"$stats_row")
  if ((${#fields[@]} < 8)); then
    printf 'unexpected query manager stats row: %s\n' "$stats_row" >&2
    return 1
  fi
  local abandoned=${fields[0]}
  local canceled=${fields[1]}
  local completed=${fields[2]}
  local exec_time_avg=${fields[3]}
  local failed=${fields[4]}
  local queued=${fields[5]}
  local queued_time_avg=${fields[6]}
  local running=${fields[7]}

  local num_nodes
  num_nodes=$(trino_jmx_query 'select "activecount" from "trino.failuredetector:name=heartbeatfailuredetector"')

  # Subtract the stats queries fired by this script from the completed count.
  local prior_stats_queries=0
  if [[ -f "$STATS_QUERY_COUNT_FILE" ]]; then
    prior_stats_queries=$(<"$STATS_QUERY_COUNT_FILE")
  fi
  local adjustment
  adjustment=$(adjust_completed_count "$completed" "$prior_stats_queries")
  completed=${adjustment% *}
  printf '%s\n' "${adjustment#* }" >"$STATS_QUERY_COUNT_FILE"

  # Percent change of the 5-minute average query time since the last run.
  local prev_exec_time_avg=0
  if [[ -f "$AVG_QUERY_TIME_FILE" ]]; then
    prev_exec_time_avg=$(<"$AVG_QUERY_TIME_FILE")
  fi
  local exec_time_avg_inc
  exec_time_avg_inc=$(percent_change "$exec_time_avg" "$prev_exec_time_avg")
  printf '%s\n' "$exec_time_avg" >"$AVG_QUERY_TIME_FILE"

  put_cloudwatch_metric MasterCpuUtilization Percent "$cpu_util"
  put_cloudwatch_metric TrinoNumQueuedQueries Count "$queued"
  put_cloudwatch_metric TrinoAvgQueryTime5m Milliseconds "$exec_time_avg"
  put_cloudwatch_metric TrinoAvgQueryTime5mInc Percent "$exec_time_avg_inc"
  put_cloudwatch_metric TrinoAvgQueuedTime5m Milliseconds "$queued_time_avg"
  put_cloudwatch_metric TrinoNumWorkerNodes Count "$num_nodes"
  put_cloudwatch_metric TrinoNumRunningQueries Count "$running"
  put_cloudwatch_metric TrinoNumCompletedQueries Count "$completed"
  put_cloudwatch_metric TrinoNumAbandonedQueries Count "$abandoned"
  put_cloudwatch_metric TrinoNumCanceledQueries Count "$canceled"
  put_cloudwatch_metric TrinoNumFailedQueries Count "$failed"

  # Pump the same Trino metrics into Ganglia if it is installed.
  if [[ -f "$GMOND_CONF" ]]; then
    put_ganglia_metric TrinoNumQueuedQueries "$queued" int32 count
    put_ganglia_metric TrinoAvgQueryTime5m "$exec_time_avg" float ms
    put_ganglia_metric TrinoAvgQueryTime5mInc "$exec_time_avg_inc" float %
    put_ganglia_metric TrinoAvgQueuedTime5m "$queued_time_avg" float ms
    put_ganglia_metric TrinoNumWorkerNodes "$num_nodes" int32 count
    put_ganglia_metric TrinoNumRunningQueries "$running" int32 count
    put_ganglia_metric TrinoNumCompletedQueries "$completed" int32 count
    put_ganglia_metric TrinoNumAbandonedQueries "$abandoned" int32 count
    put_ganglia_metric TrinoNumCanceledQueries "$canceled" int32 count
    put_ganglia_metric TrinoNumFailedQueries "$failed" int32 count
  fi
}

#######################################
# Entry point: decides whether this node has anything to publish and
# dispatches to the master or worker path.
# Globals:   REGION, CLUSTER_ID (written)
#######################################
main() {
  # Nothing to publish when the Trino server log is absent; the region and
  # cluster files are only read once we know metrics will be sent.
  [[ -f "$TRINO_SERVER_LOG" ]] || return 0

  REGION=$(<"$REGION_FILE")
  CLUSTER_ID=$(jq -r .jobFlowId "$JOB_FLOW_JSON")

  local cpu_util
  cpu_util=$(cpu_util_percent)

  # Read the flag as JSON so another "true" on the same line cannot match.
  # A missing or unreadable file is treated as "not master".
  if jq -e '.isMaster | tostring == "true"' "$INSTANCE_JSON" >/dev/null; then
    publish_master_metrics "$cpu_util"
  else
    put_cloudwatch_metric WorkerCpuUtilization Percent "$cpu_util"
  fi
}

main