# Slurm configuration file
#
# You can create a config file using the configurator at:
# https://slurm.schedmd.com/configurator.html
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
# https://slurm.schedmd.com/slurm.conf.html
#
# This is a high throughput cluster where jobs are submitted at a high rate,
# some of which are short.
# The following page documents configuration recommendations for a
# high throughput cluster:
# https://slurm.schedmd.com/high_throughput.html

# Configuration changes on the slurmctld host:
# * /proc/sys/fs/file-max: Recommended limit of 32,832. Current value: 39,148,160
# * /proc/sys/net/ipv4/tcp_max_syn_backlog: Current value: 2048. Increased to 4096
# * /proc/sys/net/ipv4/tcp_syncookies: Must be set to 1. Already set to 1.
# * /proc/sys/net/ipv4/tcp_synack_retries: Set to 5 as recommended
# * /proc/sys/net/core/somaxconn: Defaults to 128. Increase to 4096.
# * ifconfig eth0 txqueuelen 4096 (up from 1000)

ClusterName={{ClusterName}}

{% for i in range(1, NumberOfControllers|int + 1) %}
SlurmctldHost = {{SlurmCtlBaseHostname + i|string}}.{{Domain}}
{% endfor %}

# CommunicationParameters:
# NoAddrCache: Do not assume that nodes will retain their IP addresses. Do not cache the node->ip mapping.
CommunicationParameters = NoAddrCache
# Epilog={{SlurmScriptsDir}}/epilog.sh
EpilogSlurmctld={{SlurmScriptsDir}}/slurmctld-epilog.sh
# JobRequeue must be set to 1 to enable preemption to requeue jobs.
JobRequeue=1
JobSubmitPlugins=defaults
LaunchParameters=enable_nss_slurm
MpiDefault=none
ProctrackType=proctrack/cgroup
#
# PrologFlags:
# * Alloc: If set, the Prolog script will be executed at job allocation.
#   By default, Prolog is executed just before the task is launched. Therefore, when
#   salloc is started, no Prolog is executed. Alloc is useful for preparing things
#   before a user starts to use any allocated resources. In particular, this flag is
#   needed on a Cray system when cluster compatibility mode is enabled.
#   NOTE: Use of the Alloc flag will increase the time required to start jobs.
# * Contain: At job allocation time, use the ProcTrack plugin to create a job container
#   on all allocated compute nodes. This container may be used for user processes not
#   launched under Slurm control, for example pam_slurm_adopt may place processes
#   launched through a direct user login into this container. If using pam_slurm_adopt,
#   then ProcTrackType must be set to either proctrack/cgroup or proctrack/cray_aries.
#   Setting the Contain flag implicitly sets the Alloc flag.
# * X11: Enable Slurm's built-in X11 forwarding capabilities.
#   This is incompatible with ProctrackType=proctrack/linuxproc.
#   Setting the X11 flag implicitly enables both the Contain and Alloc flags as well.
PrologFlags=Contain,X11
Prolog={{SlurmScriptsDir}}/prolog.sh
PrologSlurmctld={{SlurmScriptsDir}}/slurmctld-prolog.sh
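#
# For reference, one way to verify the prolog/epilog settings that the controller
# actually loaded (for example after editing this file) is:
#   scontrol show config | grep -iE 'prolog|epilog'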
# ReturnToService
# 0: A DOWN node stays unavailable until an administrator manually returns it to service.
# 1: Only return to service if DOWN due to being non-responsive.
# 2: A DOWN node will become available for use upon registration with a valid configuration.
ReturnToService=2
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
SrunEpilog={{SlurmScriptsDir}}/srun-epilog.sh
SrunProlog={{SlurmScriptsDir}}/srun-prolog.sh
StateSaveLocation={{SlurmSpoolDir}}
SwitchType=switch/none
TaskEpilog={{SlurmScriptsDir}}/task-epilog.sh
TaskPlugin=task/cgroup
TaskProlog={{SlurmScriptsDir}}/task-prolog.sh
# TreeWidth is set to the maximum for cloud clusters so messages go directly between the controller and nodes.
# https://slurm.schedmd.com/elastic_computing.html
TreeWidth = 65533
#
#
#
# TIMERS
InactiveLimit=0
KillWait=30
#
# MessageTimeout
# Prevent error: slurm_receive_msgs: Socket timed out on send/recv operation
# See https://bugs.schedmd.com/show_bug.cgi?id=1806
# Default is 10s. Increase to 30s first and then 60s if that doesn't solve the problem.
# Still get timeouts with 60. Blast it to 180.
MessageTimeout=60
MinJobAge=300
SlurmctldTimeout=300
SlurmdTimeout=600
Waittime=0
#
#
# SCHEDULING
#
# SchedulerType
# * backfill: (default) Augments default FIFO scheduling. Depends on users specifying job time limits.
#   Backfill scheduling is a time consuming operation.
# * builtin: Strict priority order within each partition.
#   builtin is more lightweight, but scheduling is then strictly FIFO, so if a job
#   is pending on a license, ICE, etc., lower priority jobs will also pend.
SchedulerType=sched/backfill
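#
# Because backfill depends on job time limits, a job is only a backfill candidate
# if it declares a wall-clock limit. A hypothetical submission (the script name is
# illustrative only):
#   sbatch --time=00:30:00 short_job.sh
# Jobs submitted without --time fall back to the partition or cluster default limit.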
#
# SchedulerParameters
# batch_sched_delay: How long, in seconds, scheduling of batch jobs can be delayed.
# Required in a high throughput cluster.
# bf_continue: Causes the backfill scheduler to continue processing pending jobs
# from its original job list after releasing locks, even if job or node state changes.
# bf_interval: The number of seconds between backfill iterations. Higher values
# result in less overhead and better responsiveness.
# bf_licenses: Require the backfill scheduling logic to track and plan for license availability.
# By default, any job blocked on license availability will not have resources reserved, which can lead to job starvation.
# This option implicitly enables bf_running_job_reserve.
# bf_max_job_test: The maximum number of jobs to attempt backfill scheduling for.
# bf_max_job_user: The maximum number of jobs per user to attempt starting with
# the backfill scheduler for ALL partitions. One can set this limit to prevent
# users from flooding the backfill queue with jobs that cannot start and that
# prevent jobs from other users from starting.
# Set bf_max_job_test to a value much higher than bf_max_job_user.
# bf_max_time: The maximum time in seconds the backfill scheduler can spend.
# Default: bf_interval
# bf_yield_interval: The backfill scheduler will periodically relinquish locks in
# order for other pending operations to take place. This specifies how often,
# in microseconds, the locks are relinquished. Smaller values may be helpful
# for high throughput computing when used in conjunction with the bf_continue option.
# Default: 2,000,000 (2 sec)
# bf_yield_sleep: The length of time for which the locks are relinquished, in microseconds.
# default_queue_depth: The default number of jobs to attempt scheduling (i.e. the queue depth)
# when a running job completes or other routine actions occur.
# The full queue will be tested on a less frequent basis as defined by the sched_interval option.
# If left at the default of 100, scheduling stops after that many jobs, new nodes are
# not started for the remaining jobs, and those jobs get stuck in the PENDING state.
# defer: Setting this option will avoid attempting to schedule each job individually at job submit time.
# This breaks srun.
# max_rpc_cnt: Helps prevent message timeouts.
# sched_min_interval: How frequently, in microseconds, the main scheduling loop will execute and test any pending jobs.
# This is set to 2s because this is a high throughput cluster and setting it higher
# keeps the cluster responsive to incoming requests.
SchedulerParameters=\
batch_sched_delay=10\
,bf_continue\
,bf_interval=30\
,bf_licenses\
,bf_max_job_test=500\
,bf_max_job_user=0\
,bf_yield_interval=1000000\
,default_queue_depth=10000\
,max_rpc_cnt=100\
,preempt_youngest_first\
,sched_min_interval=2000000
#
# SelectType:
# Identifies the type of resource selection algorithm to be used. A restart of
# slurmctld and slurmd is required for changes to this parameter to take effect.
# When changed, all job information (running and pending) will be lost, since
# the job state save format used by each plugin is different. The only exception
# to this is when changing from cons_res to cons_tres or from cons_tres to cons_res.
# However, if a job contains cons_tres-specific features and then SelectType is
# changed to cons_res, the job will be canceled, since there is no way for
# cons_res to satisfy requirements specific to cons_tres.
#
# Acceptable values include:
# select/cons_res
# The resources (cores and memory) within a node are individually allocated as
# consumable resources. Note that whole nodes can be allocated to jobs for
# selected partitions by using the OverSubscribe=Exclusive option. See the
# partition OverSubscribe parameter for more information.
# select/cons_tres
# The resources (cores, memory, GPUs and all other trackable resources)
# within a node are individually allocated as consumable resources. Note that
# whole nodes can be allocated to jobs for selected partitions by using the
# OverSubscribe=Exclusive option. See the partition OverSubscribe parameter for more information.
# select/cray_aries
# For a Cray system. The default value is "select/cray_aries" for all Cray systems.
# select/linear
# For allocation of entire nodes assuming a one-dimensional array of nodes in
# which sequentially ordered nodes are preferable. For a heterogeneous cluster
# (e.g. different CPU counts on the various nodes), resource allocations will
# favor nodes with high CPU counts as needed based upon the job's node and CPU
# specification if TopologyPlugin=topology/none is configured. Use of other
# topology plugins with select/linear and heterogeneous nodes is not recommended
# and may result in valid job allocation requests being rejected. The linear
# plugin is not designed to track generic resources on a node. In cases where
# generic resources (such as GPUs) need to be tracked, the cons_res or cons_tres
# plugins should be used instead. This is the default value.
SelectType = select/cons_tres
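#
# With select/cons_tres, trackable resources such as GPUs can be requested directly.
# A hypothetical request (assumes a "gpu" GRES is defined in gres.conf and the
# application name is illustrative only):
#   srun --gres=gpu:1 --mem=4G my_app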
#
# SelectTypeParameters
# The permitted values of SelectTypeParameters depend upon the configured value of SelectType.
# The only supported options for SelectType=select/linear are CR_ONE_TASK_PER_CORE
# and CR_Memory, which treats memory as a consumable resource and prevents
# memory oversubscription with job preemption or gang scheduling. By default
# SelectType=select/linear allocates whole nodes to jobs without considering
# their memory consumption.
# By default SelectType=select/cons_res, SelectType=select/cray_aries, and
# SelectType=select/cons_tres use CR_Core_Memory, which allocates cores to jobs
# while considering their memory consumption.
#
# A restart of slurmctld is required for changes to this parameter to take effect.
#
# The following options are supported for SelectType=select/cray_aries: OTHER_CONS_RES, OTHER_CONS_TRES
#
# The following options are supported by the SelectType=select/cons_res and SelectType=select/cons_tres plugins:
# CR_CPU
# CPUs are consumable resources. Configure the number of CPUs on each node,
# which may be equal to the count of cores or hyper-threads on the node
# depending upon the desired minimum resource allocation. The node's Boards,
# Sockets, CoresPerSocket and ThreadsPerCore may optionally be configured
# and result in job allocations which have improved locality; however doing
# so will prevent more than one job from being allocated on each core.
# CR_CPU_Memory
# CPUs and memory are consumable resources. Configure the number of CPUs on
# each node, which may be equal to the count of cores or hyper-threads on
# the node depending upon the desired minimum resource allocation. The node's
# Boards, Sockets, CoresPerSocket and ThreadsPerCore may optionally be configured
# and result in job allocations which have improved locality; however doing so
# will prevent more than one job from being allocated on each core. Setting
# a value for DefMemPerCPU is strongly recommended.
# CR_Core
# Cores are consumable resources. On nodes with hyper-threads, each thread
# is counted as a CPU to satisfy a job's resource requirement, but multiple
# jobs are not allocated threads on the same core. The count of CPUs allocated
# to a job is rounded up to account for every CPU on an allocated core. This
# also affects total allocated memory when --mem-per-cpu is used, since it is
# multiplied by the total number of CPUs on the allocated cores.
# CR_Core_Memory
# Cores and memory are consumable resources. On nodes with hyper-threads,
# each thread is counted as a CPU to satisfy a job's resource requirement,
# but multiple jobs are not allocated threads on the same core. The count of
# CPUs allocated to a job may be rounded up to account for every CPU on an
# allocated core. Setting a value for DefMemPerCPU is strongly recommended.
# CR_ONE_TASK_PER_CORE
# Allocate one task per core by default. Without this option, by default
# one task will be allocated per thread on nodes with more than one ThreadsPerCore
# configured. NOTE: This option cannot be used with CR_CPU*.
# CR_CORE_DEFAULT_DIST_BLOCK
# Allocate cores within a node using block distribution by default. This
# is a pseudo-best-fit algorithm that minimizes the number of boards and
# minimizes the number of sockets (within minimum boards) used for the allocation.
# This default behavior can be overridden by specifying a particular "-m" parameter
# with srun/salloc/sbatch. Without this option, cores will be allocated cyclically
# across the sockets.
# CR_LLN
# Schedule resources to jobs on the least loaded nodes (based upon the number
# of idle CPUs). This is generally only recommended for an environment with
# serial jobs as idle resources will tend to be highly fragmented, resulting
# in parallel jobs being distributed across many nodes. Note that node Weight
# takes precedence over how many idle resources are on each node. Also see
# the partition configuration parameter LLN to use the least loaded nodes in selected partitions.
# CR_Pack_Nodes
# If a job allocation contains more resources than will be used for launching
# tasks (e.g. if whole nodes are allocated to a job), then rather than
# distributing a job's tasks evenly across its allocated nodes, pack them
# as tightly as possible on these nodes. For example, consider a job allocation
# containing two entire nodes with eight CPUs each. If the job starts ten tasks
# across those two nodes without this option, it will start five tasks on each
# of the two nodes. With this option, eight tasks will be started on the first
# node and two tasks on the second node. This can be superseded by "NoPack"
# in srun's "--distribution" option. CR_Pack_Nodes only applies when the "block"
# task distribution method is used.
# CR_Socket
# Sockets are consumable resources. On nodes with multiple cores, each core
# or thread is counted as a CPU to satisfy a job's resource requirement, but
# multiple jobs are not allocated resources on the same socket.
# CR_Socket_Memory
# Memory and sockets are consumable resources. On nodes with multiple cores,
# each core or thread is counted as a CPU to satisfy a job's resource requirement,
# but multiple jobs are not allocated resources on the same socket. Setting
# a value for DefMemPerCPU is strongly recommended.
# CR_Memory
# Memory is a consumable resource. NOTE: This implies OverSubscribe=YES or
# OverSubscribe=FORCE for all partitions. Setting a value for DefMemPerCPU
# is strongly recommended.
#
# NOTE: If memory isn't configured as a consumable resource (CR_CPU,
# CR_Core or CR_Socket without _Memory) memory can be oversubscribed.
# In this case the --mem option is only used to filter out nodes with lower
# configured memory and does not take running jobs into account. For instance,
# two jobs requesting all the memory of a node can run at the same time.
SelectTypeParameters = CR_CPU_Memory
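#
# With CR_CPU_Memory, both CPUs and memory should be sized on each node definition,
# and a default per-CPU memory limit is strongly recommended (see above). A
# hypothetical node entry and default (actual nodes live in slurm_nodes.conf):
#   NodeName=compute-1 CPUs=16 RealMemory=63000 State=CLOUD
#   DefMemPerCPU=3900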
#
# Enable the use of scrontab to submit and manage periodic repeating jobs.
ScronParameters = enable
#
# Preemption
# https://slurm.schedmd.com/preempt.html
# PreemptMode:
# Values: OFF, CANCEL, GANG, REQUEUE, SUSPEND
PreemptMode = {{PreemptMode}}
PreemptType = {{PreemptType}}
PreemptExemptTime = {{PreemptExemptTime}}
#
# JOB PRIORITY
#
# PriorityType
# Default is priority/basic.
# Using multifactor so the interactive partition has priority over the regress partition.
# https://slurm.schedmd.com/priority_multifactor.html
{% if UseAccountingDatabase %}
# Multifactor uses 9 factors:
# * age
# * association
# * fair-share
# * job size
# * nice
# * partition
# * QOS
# * site
# * TRES
PriorityType=priority/multifactor
PriorityWeightPartition=100000
PriorityWeightFairshare=10000
PriorityWeightQOS=10000
PriorityWeightAge=1000
PriorityWeightAssoc=0
PriorityWeightJobSize=0
{% else %}
PriorityType=priority/basic
{% endif %}
#
#
# LOGGING
#
# SlurmctldDebug: quiet, fatal, error, info, verbose, debug, debug2, debug3, debug4, debug5
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
#
# SlurmdDebug: quiet, fatal, error, info, verbose, debug, debug2, debug3, debug4, debug5
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log
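#
# For temporary troubleshooting, the controller log level and debug flags can be
# raised at runtime without editing this file, e.g.:
#   scontrol setdebug debug2
#   scontrol setdebugflags +Script
# Both revert to the values configured here the next time slurmctld restarts or
# the configuration is reloaded.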
#
# DebugFlags: Defines specific subsystems which should provide more detailed event logging.
# NO_CONF_HASH: Do not log when the slurm.conf files differ between Slurm daemons.
# Script: Debug info regarding the process that runs slurmctld scripts such as PrologSlurmctld and EpilogSlurmctld.
DebugFlags=\
NO_CONF_HASH\
,Script\
,Elasticsearch\
,Federation\
,License
#
#
#
# ACCOUNTING
#
{% if UseAccountingDatabase %}
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{AccountingStorageHost}}
AccountingStoragePort=6819
#
# AccountingStorageTRES
# Comma-separated list of resources you wish to track on the cluster.
# By default Billing, CPU, Energy, Memory, Node, FS/Disk, Pages and VMem are tracked.
#AccountingStorageTRES=
#
# AccountingStoreFlags
# Comma separated list used to tell the slurmctld to store extra fields that may be more heavyweight than the normal job information.
# job_comment: Include the job's comment field in the job complete message sent to the Accounting Storage database.
# job_env:
# job_script:
AccountingStoreFlags=job_comment
JobAcctGatherType = jobacct_gather/linux
{% else %}
AccountingStorageType=accounting_storage/none
{% endif %}
#
#
# JobCompType: The job completion logging mechanism type.
# The information captured is redundant with the Accounting database
# configured by AccountingStorageType.
# jobcomp/none: Upon job completion, a record of the job is purged from the system.
# jobcomp/elasticsearch:
# Upon job completion, a record of the job should be written to an Elasticsearch
# server, specified by the JobCompLoc parameter.
# See https://slurm.schedmd.com/elasticsearch.html
# jobcomp/filetxt: Upon job completion, a record of the job should be written to a
# text file, specified by the JobCompLoc parameter.
# jobcomp/lua: A record of the job should be processed by the jobcomp.lua script.
# jobcomp/mysql: A record of the job should be written to a MySQL or MariaDB
# database, specified by the JobCompLoc parameter.
# jobcomp/script: Upon job completion, a script specified by the JobCompLoc parameter
# is to be executed with environment variables providing the job information.
JobCompType={{JobCompType}}
{% if JobCompType == "jobcomp/elasticsearch" %}
JobCompLoc = {{JobCompLoc}}
JobCompParams = timeout=5,connect_timeout=5
{% endif %}
{% if JobCompType == "jobcomp/filetxt" %}
JobCompLoc={{SlurmAccountingDir}}/job_accounting.txt
{% endif %}
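#
# When the accounting database is enabled, completed jobs can be inspected with
# sacct; a typical query (the job id is a placeholder):
#   sacct -j 12345 --format=JobID,JobName,State,Elapsed,MaxRSS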
#
#
# SlurmctldParameters
# cloud_dns: Do not set cloud_dns unless joining the domain and adding nodes to DNS.
# Joining the domain overloads the DCs when lots of nodes are started, so
# we are not joining the domain or using cloud_dns.
# idle_on_node_suspend: Mark nodes as idle, regardless of current state, when suspending
# nodes with SuspendProgram so that nodes will be eligible to be resumed at a later time.
# node_reg_mem_percent: Percentage of memory a node is allowed to register with without being marked as invalid with low memory.
# Use this so that nodes can be configured with 100% of their memory.
# Without this option the node will fail because the system uses some of the memory.
SlurmctldParameters=\
idle_on_node_suspend\
,node_reg_mem_percent=90
#
# Allow users to see the state of nodes that are powered down
PrivateData = cloud
# FederationParameters=fed_display
#
#
# POWER SAVE SUPPORT FOR IDLE NODES
#
{% if SuspendAction == "stop" %}
SuspendProgram = {{SlurmScriptsDir}}/slurm_ec2_stop.py
{% else %}
SuspendProgram = {{SlurmScriptsDir}}/slurm_ec2_terminate.py
{% endif %}
ResumeProgram = {{SlurmScriptsDir}}/slurm_ec2_resume.py
ResumeFailProgram = {{SlurmScriptsDir}}/slurm_ec2_resume_fail.py
# Maximum time between when a node suspend request is issued and when the suspend is complete.
# At that time it should be ready to be resumed.
SuspendTimeout = 60
ResumeTimeout = 600
# Number of nodes per minute that can be resumed or suspended
ResumeRate = 300
SuspendRate = 60
# Time that a node has to be idle or down before being suspended
# Should be >= (SuspendTimeout + ResumeTimeout)
SuspendTime = 660

# Configure slurmrestd
AuthAltTypes = auth/jwt
AuthAltParameters = jwt_key={{SlurmSpoolDir}}/jwt_hs256.key

include slurm_licenses.conf
include slurm_nodes.conf
{% if ON_PREM_COMPUTE_NODES_CONFIG is defined %}
include {{ON_PREM_COMPUTE_NODES_CONFIG}}
{% endif %}
include slurm_tres.conf
{% if SLURM_CONF_OVERRIDES_PATH is defined %}
include {{SLURM_CONF_OVERRIDES_PATH}}
{% endif %}
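#
# Power-saving state can be observed with sinfo; powered-down cloud nodes show a
# "~" suffix on their state and nodes that are powering up show a "#" suffix, e.g.:
#   sinfo -N -o "%N %t"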