#!/usr/bin/bash -x cat >/home/ec2-user/send_metrics.py <= 100) }) ) # The code catchable_sigs = set(signal.Signals) - {signal.SIGKILL, signal.SIGSTOP, signal.SIGCHLD} for sig in catchable_sigs: print("handle %s" % sig) signal.signal(sig, signal_handler) instance_id = get_instance_id() # Get configs for ii in range(20): configs = get_ddb_configs(ddb_table,instance_id) if "HeartbeatToken" in configs: break print("No configs in DynamoDB, waiting") print(configs) time.sleep(5) if ii >= 19: print("Something went wrong. Terminate run") terminate_self_instance(ec2_client) sys.exit(1) # Duration until job completion in minutes (should be 2 < x < 15) job_duration_minutes = float(configs["JobDuration"]) # Time between checkpoints checkpoint_interval_minutes = float(configs["CheckpointDuration"]) job_id = configs["JobId"] heartbeat_token = configs["HeartbeatToken"] start_percentage = int(configs["Percentage"]) checkpoint_saved_percentage = int(configs["Percentage"]) sleep_duration_seconds = 60.0 * job_duration_minutes / 100.0 checkpoint_counter_seconds = 0.0 print("Starting job (duration %f min / checkpoint %f min)" % ( job_duration_minutes, checkpoint_interval_minutes )) put_cloudwatch_percentages(cw_client,start_percentage,start_percentage) put_ddb_saved_percentage(ddb_table,job_id,instance_id,start_percentage) for ii in range(start_percentage,100): time.sleep(sleep_duration_seconds) # checkpoint on time or interrupt notice checkpoint_counter_seconds += sleep_duration_seconds checkpoint_flag=((checkpoint_counter_seconds/60.0) > checkpoint_interval_minutes) print("%f%% complete - checkpoint=%s" % (ii+1,checkpoint_flag)) if checkpoint_flag: print("resetting flag") checkpoint_counter_seconds = 0.0 checkpoint_saved_percentage = ii+1 # record progress data that can be lost put_cloudwatch_percentages(cw_client,checkpoint_saved_percentage,ii+1) put_ddb_saved_percentage(ddb_table,job_id,instance_id,checkpoint_saved_percentage) send_task_hearbeat(sfn_client,heartbeat_token) # End on interrupt if check_interrupt_notice(): checkpoint_saved_percentage = ii+1 put_cloudwatch_percentages(cw_client,checkpoint_saved_percentage,ii+1) put_ddb_saved_percentage(ddb_table,job_id,instance_id,checkpoint_saved_percentage) send_task_success(sfn_client,heartbeat_token,checkpoint_saved_percentage) # Write final data put_cloudwatch_percentages(cw_client,100,100) put_ddb_saved_percentage(ddb_table,job_id,instance_id,100) send_task_success(sfn_client,heartbeat_token,100) # At completion suicide instance terminate_self_instance(ec2_client) EOT yum install -y jq pip3 install boto3 cat >/home/ec2-user/send_metrics <>/var/log/send_metrics.log & /usr/bin/date EOT chmod 755 /home/ec2-user/send_metrics touch /var/log/send_metrics.log chmod 666 /var/log/send_metrics.log #/usr/bin/bash -x /home/ec2-user/send_metrics cat >/lib/systemd/system/spot.service <