# frozen_string_literal: true

#
# Cookbook:: aws-parallelcluster-slurm
# Recipe:: emit_chef_error_event
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

require 'chef/handler'
require 'date'

module WriteChefError
  # Chef handler used for the slurm scheduler: it collects the failed actions of the Chef run and
  # writes them, together with node metadata, as a single JSON line into the file located at
  # node["cluster"]["bootstrap_error_path"].
  class WriteComputeFleetSlurmChefError < Chef::Handler
    # Run-list item name in the "aws-parallelcluster::init" format.
    # Capture group 1 is the part before "::", capture group 2 the part after it.
    COMPONENT_PATTERN = /^([a-z\-]+)::([a-z]+)$/

    # Handler entry point: builds the error event from the failed action records and writes it
    # to the bootstrap error file, then sleeps for 5 seconds.
    def report
      error_file = node["cluster"]["bootstrap_error_path"]

      # get the failed action records using the chef function filtered_collection
      # reference: https://github.com/cinc-project/chef/blob/stable/cinc/lib/chef/action_collection.rb#L107
      failed_action_collection = action_collection.filtered_collection(
        up_to_date: false,
        skipped: false,
        updated: false,
        failed: true,
        unprocessed: false
      )
      failures = failed_action_collection.map { |action_record| get_failure_detail(action_record) }.compact
      error_info = get_error_info(node, failures)

      # File.write rather than IO.write: IO.write interprets a path starting with "|" as a command
      # to run, File.write always treats its first argument as a file path.
      File.write(error_file, "#{error_info.to_json}\n")

      # the 5s sleep time here will extend the overall sleep time set in the CLI repo:
      # cli/src/pcluster/resources/compute_node/user_data.sh, in order to allow CW agent enough time
      # to detect this new error log file, create the logstream and push the content to the logstream
      sleep(5)
    end

    # Builds the description of a single failed action.
    #
    # @param action_record the failed action record; it is read through exception, error_description,
    #   nesting_level, new_resource and action
    # @return [Hash] failure details keyed by dash-separated string names
    def get_failure_detail(action_record)
      resource = action_record.new_resource
      {
        "exception-type" => action_record.exception.class.name,
        "error-title" => action_record.error_description["title"],
        "nesting-level" => action_record.nesting_level,
        "cookbook-name" => resource.cookbook_name,
        "recipe-name" => resource.recipe_name,
        "source-line" => resource.source_line,
        "resource-name" => resource.name,
        "resource-type" => resource.declared_type,
        "action" => action_record.action,
      }
    end

    # Builds the complete error event that gets serialized to the error file.
    #
    # @param node the Chef node; its "cluster", "ec2" and "ipaddress" attributes and its
    #   override_runlist are read
    # @param failures [Array<Hash>] failure details as produced by get_failure_detail
    # @return [Hash] the error event
    # @raise [RuntimeError] when the component cannot be derived from the override run-list
    def get_error_info(node, failures)
      cluster = node["cluster"]
      ec2 = node["ec2"]
      {
        "datetime" => DateTime.now,
        "version" => 0,
        "cluster-name" => cluster["cluster_name"],
        "scheduler" => cluster["scheduler"],
        "node-role" => "ComputeFleet",
        "level" => "ERROR",
        "instance-id" => ec2["instance_id"],
        "event-type" => "chef-recipe-exception",
        "message" => "Chef recipe exception",
        "component" => get_component(node.override_runlist),
        "compute" => {
          "name" => cluster["slurm_nodename"],
          "instance-id" => ec2["instance_id"],
          "instance-type" => ec2["instance_type"],
          "availability-zone" => ec2["availability_zone"],
          "address" => node["ipaddress"],
          "hostname" => ec2["hostname"],
          "queue-name" => cluster["scheduler_queue_name"],
          "compute-resource" => cluster["scheduler_compute_resource_name"],
          "node-type" => get_node_type(cluster["slurm_nodename"]),
        },
        "detail" => {
          "failures" => failures,
        },
      }
    end

    # Classifies a slurm node name as static or dynamic.
    #
    # @param node_name [String, nil] the slurm node name
    # @return [String, nil] "static" or "dynamic", or nil when node_name is nil
    def get_node_type(node_name)
      return if node_name.nil?

      # is_static_node? is not defined in this file; it has to be provided by the surrounding cookbook code
      is_static_node?(node_name) ? "static" : "dynamic"
    end

    # Extracts the component from the name of the first run-list item.
    #
    # The name has to match the "aws-parallelcluster::init" format; the part after "::" is returned.
    # According to the original author this is one of "init", "configure", "finalize"; the pattern
    # itself accepts any run of lowercase letters.
    #
    # @param runlist the override run-list (node.override_runlist)
    # @return [String] the part of the first item's name that follows "::"
    # @raise [RuntimeError] when the run-list is empty, its first item has no name, or the name
    #   does not match the expected format
    def get_component(runlist)
      # Safe navigation makes an empty run-list (or a nil name) end up in the descriptive error
      # below instead of failing with a NoMethodError on nil.
      match = runlist[0]&.name&.match(COMPONENT_PATTERN)
      raise "Failed when parsing the runlist: #{runlist}" if match.nil?

      match[2]
    end
  end
end