# Slurm configuration file
#
# You can create a config file using the configurator at:
# https://slurm.schedmd.com/configurator.html
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
# https://slurm.schedmd.com/slurm.conf.html
#
# This is a high throughput cluster where jobs are submitted at a high rate,
# some of which are short.
# The following page documents configuration recommendations for a
# high throughput cluster:
# https://slurm.schedmd.com/high_throughput.html

# Configuration changes on the slurmctld host:
# * /proc/sys/fs/file-max: Recommended limit of 32,832. Current value: 39,148,160
# * /proc/sys/net/ipv4/tcp_max_syn_backlog: Current value: 2048. Increased to 4096
# * /proc/sys/net/ipv4/tcp_syncookies: Must be set to 1. Already set to 1.
# * /proc/sys/net/ipv4/tcp_synack_retries: Set to 5 as recommended
# * /proc/sys/net/core/somaxconn: Defaults to 128. Increase to 4096.
# * ifconfig eth0 txqueuelen 4096 (up from 1000)

ClusterName={{ClusterName}}

{% for i in range(1, NumberOfControllers|int + 1) %}
SlurmctldHost = {{SlurmCtlBaseHostname + i|string}}.{{Domain}}
{% endfor %}

# CommunicationParameters:
# NoAddrCache: Do not assume that nodes will retain their IP addresses. Do not cache the node->ip mapping.
CommunicationParameters = NoAddrCache
# Epilog={{SlurmScriptsDir}}/epilog.sh
EpilogSlurmctld={{SlurmScriptsDir}}/slurmctld-epilog.sh
# JobRequeue must be set to 1 to enable preemption to requeue jobs.
JobRequeue=1
JobSubmitPlugins=defaults
LaunchParameters=enable_nss_slurm
MpiDefault=none
ProctrackType=proctrack/cgroup
#
# PrologFlags:
# * Alloc: If set, the Prolog script will be executed at job allocation.
#   By default, Prolog is executed just before the task is launched. Therefore, when
#   salloc is started, no Prolog is executed. Alloc is useful for preparing things
#   before a user starts to use any allocated resources. In particular, this flag is
#   needed on a Cray system when cluster compatibility mode is enabled.
#   NOTE: Use of the Alloc flag will increase the time required to start jobs.
# * Contain: At job allocation time, use the ProcTrack plugin to create a job container
#   on all allocated compute nodes. This container may be used for user processes not
#   launched under Slurm control, for example pam_slurm_adopt may place processes
#   launched through a direct user login into this container. If using pam_slurm_adopt,
#   then ProcTrackType must be set to either proctrack/cgroup or proctrack/cray_aries.
#   Setting the Contain flag implicitly sets the Alloc flag.
# * X11: Enable Slurm's built-in X11 forwarding capabilities.
#   This is incompatible with ProctrackType=proctrack/linuxproc.
#   Setting the X11 flag implicitly enables both the Contain and Alloc flags as well.
PrologFlags=Contain,X11
Prolog={{SlurmScriptsDir}}/prolog.sh
PrologSlurmctld={{SlurmScriptsDir}}/slurmctld-prolog.sh
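#
# For reference, one way to verify the prolog/epilog settings that the controller
# actually loaded (for example after editing this file) is:
#   scontrol show config | grep -iE 'prolog|epilog'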
# ReturnToService
# 0: A DOWN node stays unavailable until an administrator manually returns it to service.
# 1: Only return to service if DOWN due to being non-responsive.
# 2: A DOWN node will become available for use upon registration with a valid configuration.
ReturnToService=2
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
SrunEpilog={{SlurmScriptsDir}}/srun-epilog.sh
SrunProlog={{SlurmScriptsDir}}/srun-prolog.sh
StateSaveLocation={{SlurmSpoolDir}}
SwitchType=switch/none
TaskEpilog={{SlurmScriptsDir}}/task-epilog.sh
TaskPlugin=task/cgroup
TaskProlog={{SlurmScriptsDir}}/task-prolog.sh
# TreeWidth is set to the maximum for cloud clusters so messages go directly between the controller and nodes.
# https://slurm.schedmd.com/elastic_computing.html
TreeWidth = 65533
#
#
#
# TIMERS
InactiveLimit=0
KillWait=30
#
# MessageTimeout
# Prevent error: slurm_receive_msgs: Socket timed out on send/recv operation
# See https://bugs.schedmd.com/show_bug.cgi?id=1806
# Default is 10s. Increase to 30s first and then 60s if that doesn't solve the problem.
# Still get timeouts with 60. Blast it to 180.
MessageTimeout=60
MinJobAge=300
SlurmctldTimeout=300
SlurmdTimeout=600
Waittime=0
#
#
# SCHEDULING
#
# SchedulerType
# * backfill: (default) Augments default FIFO scheduling. Depends on users specifying job time limits.
#   Backfill scheduling is a time consuming operation.
# * builtin: Strict priority order within each partition.
#   builtin is more lightweight, but scheduling is then strictly FIFO, so if a job
#   is pending on a license, ICE, etc., lower priority jobs will also pend.
SchedulerType=sched/backfill
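#
# Because backfill depends on job time limits, a job is only a backfill candidate
# if it declares a wall-clock limit. A hypothetical submission (the script name is
# illustrative only):
#   sbatch --time=00:30:00 short_job.sh
# Jobs submitted without --time fall back to the partition or cluster default limit.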
#
# SchedulerParameters
# batch_sched_delay: How long, in seconds, scheduling of batch jobs can be delayed.
# Required in a high throughput cluster.
# bf_continue: Causes the backfill scheduler to continue processing pending jobs
# from its original job list after releasing locks, even if job or node state changes.
# bf_interval: The number of seconds between backfill iterations. Higher values
# result in less overhead and better responsiveness.
# bf_licenses: Require the backfill scheduling logic to track and plan for license availability.
# By default, any job blocked on license availability will not have resources reserved, which can lead to job starvation.
# This option implicitly enables bf_running_job_reserve.
# bf_max_job_test: The maximum number of jobs to attempt backfill scheduling for.
# bf_max_job_user: The maximum number of jobs per user to attempt starting with
# the backfill scheduler for ALL partitions. One can set this limit to prevent
# users from flooding the backfill queue with jobs that cannot start and that
# prevent jobs from other users from starting.
# Set bf_max_job_test to a value much higher than bf_max_job_user.
# bf_max_time: The maximum time in seconds the backfill scheduler can spend.
# Default: bf_interval
# bf_yield_interval: The backfill scheduler will periodically relinquish locks in
# order for other pending operations to take place. This specifies how often,
# in microseconds, the locks are relinquished. Smaller values may be helpful
# for high throughput computing when used in conjunction with the bf_continue option.
# Default: 2,000,000 (2 sec)
# bf_yield_sleep: The length of time for which the locks are relinquished, in microseconds.
# default_queue_depth: The default number of jobs to attempt scheduling (i.e. the queue depth)
# when a running job completes or other routine actions occur.
# The full queue will be tested on a less frequent basis as defined by the sched_interval option.
# If left at the default of 100, scheduling stops after that many jobs, new nodes are
# not started for the remaining jobs, and those jobs get stuck in the PENDING state.
# defer: Setting this option will avoid attempting to schedule each job individually at job submit time.
# This breaks srun.
# max_rpc_cnt: Helps prevent message timeouts.
# sched_min_interval: How frequently, in microseconds, the main scheduling loop will execute and test any pending jobs.
# This is set to 2s because this is a high throughput cluster and setting it higher
# keeps the cluster responsive to incoming requests.
SchedulerParameters=\
batch_sched_delay=10\
,bf_continue\
,bf_interval=30\
,bf_licenses\
,bf_max_job_test=500\
,bf_max_job_user=0\
,bf_yield_interval=1000000\
,default_queue_depth=10000\
,max_rpc_cnt=100\
,preempt_youngest_first\
,sched_min_interval=2000000
#
# SelectType:
# Identifies the type of resource selection algorithm to be used. A restart of
# slurmctld and slurmd is required for changes to this parameter to take effect.
# When changed, all job information (running and pending) will be lost, since
# the job state save format used by each plugin is different. The only exception
# to this is when changing from cons_res to cons_tres or from cons_tres to cons_res.
# However, if a job contains cons_tres-specific features and then SelectType is
# changed to cons_res, the job will be canceled, since there is no way for
# cons_res to satisfy requirements specific to cons_tres.
#
# Acceptable values include:
# select/cons_res
# The resources (cores and memory) within a node are individually allocated as
# consumable resources. Note that whole nodes can be allocated to jobs for
# selected partitions by using the OverSubscribe=Exclusive option. See the
# partition OverSubscribe parameter for more information.
# select/cons_tres
# The resources (cores, memory, GPUs and all other trackable resources)
# within a node are individually allocated as consumable resources. Note that
# whole nodes can be allocated to jobs for selected partitions by using the
# OverSubscribe=Exclusive option. See the partition OverSubscribe parameter for more information.
# select/cray_aries
# For a Cray system. The default value is "select/cray_aries" for all Cray systems.
# select/linear
# For allocation of entire nodes assuming a one-dimensional array of nodes in
# which sequentially ordered nodes are preferable. For a heterogeneous cluster
# (e.g. different CPU counts on the various nodes), resource allocations will
# favor nodes with high CPU counts as needed based upon the job's node and CPU
# specification if TopologyPlugin=topology/none is configured. Use of other
# topology plugins with select/linear and heterogeneous nodes is not recommended
# and may result in valid job allocation requests being rejected. The linear
# plugin is not designed to track generic resources on a node. In cases where
# generic resources (such as GPUs) need to be tracked, the cons_res or cons_tres
# plugins should be used instead. This is the default value.
SelectType = select/cons_tres
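#
# With select/cons_tres, trackable resources such as GPUs can be requested directly.
# A hypothetical request (assumes a "gpu" GRES is defined in gres.conf and the
# application name is illustrative only):
#   srun --gres=gpu:1 --mem=4G my_app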
#
# SelectTypeParameters
# The permitted values of SelectTypeParameters depend upon the configured value of SelectType.
# The only supported options for SelectType=select/linear are CR_ONE_TASK_PER_CORE
# and CR_Memory, which treats memory as a consumable resource and prevents
# memory oversubscription with job preemption or gang scheduling. By default
# SelectType=select/linear allocates whole nodes to jobs without considering
# their memory consumption.
# By default SelectType=select/cons_res, SelectType=select/cray_aries, and
# SelectType=select/cons_tres use CR_Core_Memory, which allocates cores to jobs
# while considering their memory consumption.
#
# A restart of slurmctld is required for changes to this parameter to take effect.
#
# The following options are supported for SelectType=select/cray_aries: OTHER_CONS_RES, OTHER_CONS_TRES
#
# The following options are supported by the SelectType=select/cons_res and SelectType=select/cons_tres plugins:
# CR_CPU
# CPUs are consumable resources. Configure the number of CPUs on each node,
# which may be equal to the count of cores or hyper-threads on the node
# depending upon the desired minimum resource allocation. The node's Boards,
# Sockets, CoresPerSocket and ThreadsPerCore may optionally be configured
# and result in job allocations which have improved locality; however doing
# so will prevent more than one job from being allocated on each core.
# CR_CPU_Memory
# CPUs and memory are consumable resources. Configure the number of CPUs on
# each node, which may be equal to the count of cores or hyper-threads on
# the node depending upon the desired minimum resource allocation. The node's
# Boards, Sockets, CoresPerSocket and ThreadsPerCore may optionally be configured
# and result in job allocations which have improved locality; however doing so
# will prevent more than one job from being allocated on each core. Setting
# a value for DefMemPerCPU is strongly recommended.
# CR_Core
# Cores are consumable resources. On nodes with hyper-threads, each thread
# is counted as a CPU to satisfy a job's resource requirement, but multiple
# jobs are not allocated threads on the same core. The count of CPUs allocated
# to a job is rounded up to account for every CPU on an allocated core. This
# also affects total allocated memory when --mem-per-cpu is used, since it is
# multiplied by the total number of CPUs on the allocated cores.
# CR_Core_Memory
# Cores and memory are consumable resources. On nodes with hyper-threads,
# each thread is counted as a CPU to satisfy a job's resource requirement,
# but multiple jobs are not allocated threads on the same core. The count of
# CPUs allocated to a job may be rounded up to account for every CPU on an
# allocated core. Setting a value for DefMemPerCPU is strongly recommended.
# CR_ONE_TASK_PER_CORE
# Allocate one task per core by default. Without this option, by default
# one task will be allocated per thread on nodes with more than one ThreadsPerCore
# configured. NOTE: This option cannot be used with CR_CPU*.
# CR_CORE_DEFAULT_DIST_BLOCK
# Allocate cores within a node using block distribution by default. This
# is a pseudo-best-fit algorithm that minimizes the number of boards and
# minimizes the number of sockets (within minimum boards) used for the allocation.
# This default behavior can be overridden by specifying a particular "-m" parameter
# with srun/salloc/sbatch. Without this option, cores will be allocated cyclically
# across the sockets.
# CR_LLN
# Schedule resources to jobs on the least loaded nodes (based upon the number
# of idle CPUs). This is generally only recommended for an environment with
# serial jobs as idle resources will tend to be highly fragmented, resulting
# in parallel jobs being distributed across many nodes. Note that node Weight
# takes precedence over how many idle resources are on each node. Also see
# the partition configuration parameter LLN to use the least loaded nodes in selected partitions.
# CR_Pack_Nodes
# If a job allocation contains more resources than will be used for launching
# tasks (e.g. if whole nodes are allocated to a job), then rather than
# distributing a job's tasks evenly across its allocated nodes, pack them
# as tightly as possible on these nodes. For example, consider a job allocation
# containing two entire nodes with eight CPUs each. If the job starts ten tasks
# across those two nodes without this option, it will start five tasks on each
# of the two nodes. With this option, eight tasks will be started on the first
# node and two tasks on the second node. This can be superseded by "NoPack"
# in srun's "--distribution" option. CR_Pack_Nodes only applies when the "block"
# task distribution method is used.
# CR_Socket
# Sockets are consumable resources. On nodes with multiple cores, each core
# or thread is counted as a CPU to satisfy a job's resource requirement, but
# multiple jobs are not allocated resources on the same socket.
# CR_Socket_Memory
# Memory and sockets are consumable resources. On nodes with multiple cores,
# each core or thread is counted as a CPU to satisfy a job's resource requirement,
# but multiple jobs are not allocated resources on the same socket. Setting
# a value for DefMemPerCPU is strongly recommended.
# CR_Memory
# Memory is a consumable resource. NOTE: This implies OverSubscribe=YES or
# OverSubscribe=FORCE for all partitions. Setting a value for DefMemPerCPU
# is strongly recommended.
#
# NOTE: If memory isn't configured as a consumable resource (CR_CPU,
# CR_Core or CR_Socket without _Memory) memory can be oversubscribed.
# In this case the --mem option is only used to filter out nodes with lower
# configured memory and does not take running jobs into account. For instance,
# two jobs requesting all the memory of a node can run at the same time.
SelectTypeParameters = CR_CPU_Memory
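#
# With CR_CPU_Memory, both CPUs and memory should be sized on each node definition,
# and a default per-CPU memory limit is strongly recommended (see above). A
# hypothetical node entry and default (actual nodes live in slurm_nodes.conf):
#   NodeName=compute-1 CPUs=16 RealMemory=63000 State=CLOUD
#   DefMemPerCPU=3900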
#
# Enable the use of scrontab to submit and manage periodic repeating jobs.
ScronParameters = enable
#
# Preemption
# https://slurm.schedmd.com/preempt.html
# PreemptMode:
# Values: OFF, CANCEL, GANG, REQUEUE, SUSPEND
PreemptMode = {{PreemptMode}}
PreemptType = {{PreemptType}}
PreemptExemptTime = {{PreemptExemptTime}}
#
# JOB PRIORITY
#
# PriorityType
# Default is priority/basic.
# Using multifactor so the interactive partition has priority over the regress partition.
# https://slurm.schedmd.com/priority_multifactor.html
{% if UseAccountingDatabase %}
# Multifactor uses 9 factors:
# * age
# * association
# * fair-share
# * job size
# * nice
# * partition
# * QOS
# * site
# * TRES
PriorityType=priority/multifactor
PriorityWeightPartition=100000
PriorityWeightFairshare=10000
PriorityWeightQOS=10000
PriorityWeightAge=1000
PriorityWeightAssoc=0
PriorityWeightJobSize=0
{% else %}
PriorityType=priority/basic
{% endif %}
#
#
# LOGGING
#
# SlurmctldDebug: quiet, fatal, error, info, verbose, debug, debug2, debug3, debug4, debug5
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
#
# SlurmdDebug: quiet, fatal, error, info, verbose, debug, debug2, debug3, debug4, debug5
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log
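#
# For temporary troubleshooting, the controller log level and debug flags can be
# raised at runtime without editing this file, e.g.:
#   scontrol setdebug debug2
#   scontrol setdebugflags +Script
# Both revert to the values configured here the next time slurmctld restarts or
# the configuration is reloaded.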
#
# DebugFlags: Defines specific subsystems which should provide more detailed event logging.
# NO_CONF_HASH: Do not log when the slurm.conf files differ between Slurm daemons.
# Script: Debug info regarding the process that runs slurmctld scripts such as PrologSlurmctld and EpilogSlurmctld.
DebugFlags=\
NO_CONF_HASH\
,Script\
,Elasticsearch\
,Federation\
,License
#
#
#
# ACCOUNTING
#
{% if UseAccountingDatabase %}
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{AccountingStorageHost}}
AccountingStoragePort=6819
#
# AccountingStorageTRES
# Comma-separated list of resources you wish to track on the cluster.
# By default Billing, CPU, Energy, Memory, Node, FS/Disk, Pages and VMem are tracked.
#AccountingStorageTRES=
#
# AccountingStoreFlags
# Comma separated list used to tell the slurmctld to store extra fields that may be more heavyweight than the normal job information.
# job_comment: Include the job's comment field in the job complete message sent to the Accounting Storage database.
# job_env:
# job_script:
AccountingStoreFlags=job_comment
JobAcctGatherType = jobacct_gather/linux
{% else %}
AccountingStorageType=accounting_storage/none
{% endif %}
#
#
# JobCompType: The job completion logging mechanism type.
# The information captured is redundant with the Accounting database
# configured by AccountingStorageType.
# jobcomp/none: Upon job completion, a record of the job is purged from the system.
# jobcomp/elasticsearch:
# Upon job completion, a record of the job should be written to an Elasticsearch
# server, specified by the JobCompLoc parameter.
# See https://slurm.schedmd.com/elasticsearch.html
# jobcomp/filetxt: Upon job completion, a record of the job should be written to a
# text file, specified by the JobCompLoc parameter.
# jobcomp/lua: A record of the job should be processed by the jobcomp.lua script.
# jobcomp/mysql: A record of the job should be written to a MySQL or MariaDB
# database, specified by the JobCompLoc parameter.
# jobcomp/script: Upon job completion, a script specified by the JobCompLoc parameter
# is to be executed with environment variables providing the job information.
JobCompType={{JobCompType}}
{% if JobCompType == "jobcomp/elasticsearch" %}
JobCompLoc = {{JobCompLoc}}
JobCompParams = timeout=5,connect_timeout=5
{% endif %}
{% if JobCompType == "jobcomp/filetxt" %}
JobCompLoc={{SlurmAccountingDir}}/job_accounting.txt
{% endif %}
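#
# When the accounting database is enabled, completed jobs can be inspected with
# sacct; a typical query (the job id is a placeholder):
#   sacct -j 12345 --format=JobID,JobName,State,Elapsed,MaxRSS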
#
#
# SlurmctldParameters
# cloud_dns: Do not set cloud_dns unless joining the domain and adding nodes to DNS.
# Joining the domain overloads the DCs when lots of nodes are started, so
# we are not joining the domain or using cloud_dns.
# idle_on_node_suspend: Mark nodes as idle, regardless of current state, when suspending
# nodes with SuspendProgram so that nodes will be eligible to be resumed at a later time.
# node_reg_mem_percent: Percentage of memory a node is allowed to register with without being marked as invalid with low memory.
# Use this so that nodes can be configured with 100% of their memory.
# Without this option the node will fail because the system uses some of the memory.
SlurmctldParameters=\
idle_on_node_suspend\
,node_reg_mem_percent=90
#
# Allow users to see the state of nodes that are powered down
PrivateData = cloud
# FederationParameters=fed_display
#
#
# POWER SAVE SUPPORT FOR IDLE NODES
#
{% if SuspendAction == "stop" %}
SuspendProgram = {{SlurmScriptsDir}}/slurm_ec2_stop.py
{% else %}
SuspendProgram = {{SlurmScriptsDir}}/slurm_ec2_terminate.py
{% endif %}
ResumeProgram = {{SlurmScriptsDir}}/slurm_ec2_resume.py
ResumeFailProgram = {{SlurmScriptsDir}}/slurm_ec2_resume_fail.py
# Maximum time between when a node suspend request is issued and when the suspend is complete.
# At that time it should be ready to be resumed.
SuspendTimeout = 60
ResumeTimeout = 600
# Number of nodes per minute that can be resumed or suspended
ResumeRate = 300
SuspendRate = 60
# Time that a node has to be idle or down before being suspended
# Should be >= (SuspendTimeout + ResumeTimeout)
SuspendTime = 660

# Configure slurmrestd
AuthAltTypes = auth/jwt
AuthAltParameters = jwt_key={{SlurmSpoolDir}}/jwt_hs256.key

include slurm_licenses.conf
include slurm_nodes.conf
{% if ON_PREM_COMPUTE_NODES_CONFIG is defined %}
include {{ON_PREM_COMPUTE_NODES_CONFIG}}
{% endif %}
include slurm_tres.conf
{% if SLURM_CONF_OVERRIDES_PATH is defined %}
include {{SLURM_CONF_OVERRIDES_PATH}}
{% endif %}
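#
# Power-saving state can be observed with sinfo; powered-down cloud nodes show a
# "~" suffix on their state and nodes that are powering up show a "#" suffix, e.g.:
#   sinfo -N -o "%N %t"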