/*
 * Copyright (c) 2020-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
 */

#ifndef NCCL_OFI_PARAM_H_
#define NCCL_OFI_PARAM_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "nccl_ofi_log.h"

#define OFI_NCCL_PARAM_INT(name, env, default_value) \
    static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \
    static inline int64_t ofi_nccl_##name() { \
        static bool initialized = false; \
        static int64_t value = default_value; \
        if (initialized) { \
            return value; \
        } \
        pthread_mutex_lock(&ofi_nccl_param_lock_##name); \
        int64_t v; \
        char *str, *endptr; \
        if (!initialized) { \
            str = getenv("OFI_NCCL_" env); \
            if (str && strlen(str) > 0) { \
                errno = 0; \
                v = strtoll(str, &endptr, 0); \
                if (errno || str == endptr || *endptr != '\0') { \
                    NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, \
                                  "Invalid value %s provided for %s environment variable, using default %lu", \
                                  str, "OFI_NCCL_" env, value); \
                } else { \
                    value = v; \
                    NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Setting %s environment variable to %lu", \
                                  "OFI_NCCL_" env, value); \
                } \
            } \
            initialized = true; \
        } \
        pthread_mutex_unlock(&ofi_nccl_param_lock_##name); \
        return value; \
    }

#define OFI_NCCL_PARAM_STR(name, env, default_value) \
    static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \
    static inline const char *ofi_nccl_##name() { \
        static bool initialized = false; \
        static const char *value = default_value; \
        if (initialized) { \
            return value; \
        } \
        pthread_mutex_lock(&ofi_nccl_param_lock_##name); \
        char *str; \
        if (!initialized) { \
            str = getenv("OFI_NCCL_" env); \
            if (str) { \
                value = strdup(str); \
                if (value) { \
                    NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Setting %s environment variable to %s", \
                                  "OFI_NCCL_" env, value); \
                } else { \
                    value = default_value; \
                    NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, \
                                  "Allocation error saving result for %s environment variable. Falling back to default %s", \
                                  "OFI_NCCL_" env, value); \
                } \
            } \
            initialized = true; \
        } \
        pthread_mutex_unlock(&ofi_nccl_param_lock_##name); \
        return value; \
    }

/*
 * Enable using endpoints with IPv6 addressing format for the TCP provider.
 * By default, endpoints with IPv6 addressing format are disabled.
 */
OFI_NCCL_PARAM_INT(use_ipv6_tcp, "USE_IPV6_TCP", 0);

/*
 * List of interface names (comma-separated) to be filtered out for the TCP
 * provider. By default, it is set to eliminate the lo and docker0 interfaces.
 *
 * TODO: Remove lo after https://github.com/ofiwg/libfabric/issues/6127 is fixed
 */
OFI_NCCL_PARAM_STR(exclude_tcp_if, "EXCLUDE_TCP_IF", "lo,docker0");

/*
 * Disable the flush operation when using GPUDirect. Flush commands are used
 * to enforce data consistency at the receiving GPU. They should only be
 * disabled when the underlying Libfabric provider or hardware ensures data
 * consistency. By default, the plugin issues flush commands.
 */
OFI_NCCL_PARAM_INT(gdr_flush_disable, "GDR_FLUSH_DISABLE", 0);

/*
 * Specify the number of network connections created by NIC_DUP_CONNS. Each
 * chosen Libfabric provider will be duplicated N times and exposed to NCCL
 * as a unique endpoint.
 */
OFI_NCCL_PARAM_INT(nic_dup_conns, "NIC_DUP_CONNS", 0);

/*
 * When using GPUDirect, use cudaDeviceFlushGPUDirectRDMAWrites to enforce
 * data consistency at the receiving GPU. Requires CUDA 11.3 or later. Note
 * that this function only provides a GPU memory fence and requires that data
 * has already been delivered to GPU memory. Some networks and PCIe
 * configurations require an additional network-level flush that is not
 * provided by this option.
 */
OFI_NCCL_PARAM_INT(cuda_flush_enable, "CUDA_FLUSH_ENABLE", 0);
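/*
 * Usage sketch for the OFI_NCCL_PARAM_INT / OFI_NCCL_PARAM_STR macros defined
 * above (illustrative only; "example_threshold" is a hypothetical parameter,
 * not one defined by this plugin):
 *
 *   OFI_NCCL_PARAM_INT(example_threshold, "EXAMPLE_THRESHOLD", 1024);
 *
 *   void configure(void)
 *   {
 *       // The first call reads OFI_NCCL_EXAMPLE_THRESHOLD from the
 *       // environment, parses it with strtoll() (base 0, so decimal, hex,
 *       // and octal are accepted), and caches the result. Subsequent calls
 *       // return the cached value without taking the lock. If the variable
 *       // is unset or invalid, the default (1024) is returned.
 *       int64_t threshold = ofi_nccl_example_threshold();
 *       (void)threshold;
 *   }
 */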
/*
 * Specify the memory registration key size in bytes when using a Libfabric
 * provider that supports application-selected memory registration keys.
 */
OFI_NCCL_PARAM_INT(mr_key_size, "MR_KEY_SIZE", 2);

/*
 * Maximum number of CQ entries to read in a single call to fi_cq_read.
 */
OFI_NCCL_PARAM_INT(cq_read_count, "CQ_READ_COUNT", 4);

/*
 * Protocol to use for send/recv operations. Valid options are SENDRECV and
 * RDMA, with SENDRECV the default. The parameter's default is NULL so that
 * we can determine whether the user set the option.
 */
OFI_NCCL_PARAM_STR(protocol, "PROTOCOL", NULL);

/*
 * When enabled and the RDMA communication protocol is used, write the NCCL
 * topology file and set the environment variable `NCCL_TOPO_FILE`. By
 * default, the OFI plugin writes the NCCL topology file to a unique
 * temporary file using the file path template
 * `/tmp/aws-ofi-nccl-topo-XXXXXX`, and the file is deleted at normal
 * process termination. See the environment variable
 * `OFI_NCCL_TOPO_FILE_TEMPLATE` to control the file destination.
 */
OFI_NCCL_PARAM_INT(topo_file_write_enable, "TOPO_FILE_WRITE_ENABLE", 0);

/*
 * User-defined file path template that is used in case the NCCL topology
 * file is written. The last six characters of the template must be XXXXXX
 * and will be replaced to make the filename unique.
 */
OFI_NCCL_PARAM_STR(topo_file_template, "TOPO_FILE_TEMPLATE", NULL);

/*
 * Disable the native RDMA write support check when using the "RDMA" protocol
 * for send/recv operations on AWS platforms. When the check is disabled, the
 * "RDMA" protocol can be used even on platforms where native RDMA write is
 * not supported or cannot be verified to be supported. By default, the
 * plugin performs the native RDMA support checks.
 */
OFI_NCCL_PARAM_INT(disable_native_rdma_check, "DISABLE_NATIVE_RDMA_CHECK", 0);

/*
 * Maximum size of a message in bytes before the message is multiplexed.
 */
OFI_NCCL_PARAM_INT(round_robin_threshold, "ROUND_ROBIN_THRESHOLD", 8192);

/*
 * Minimum bounce buffers posted per endpoint. The plugin will attempt to
 * post more bounce buffers if we dip below this threshold, allocating new
 * bounce buffers if needed.
 */
OFI_NCCL_PARAM_INT(rdma_min_posted_bounce_buffers, "RDMA_MIN_POSTED_BOUNCE_BUFFERS", 64);

/*
 * Maximum bounce buffers posted per endpoint. The plugin will not attempt to
 * post more bounce buffers if we reach this threshold, returning available
 * buffers to the free list if needed.
 */
OFI_NCCL_PARAM_INT(rdma_max_posted_bounce_buffers, "RDMA_MAX_POSTED_BOUNCE_BUFFERS", 128);

/*
 * Internode network latency reported to NCCL. Defaults to 0, unless the
 * configured platform sets a specific value.
 */
OFI_NCCL_PARAM_INT(net_latency, "NET_LATENCY", -1);

/*
 * Eager message size limit when using the RDMA protocol. Message sizes
 * greater than this limit will always be sent using RDMA write instead of
 * eagerly.
 */
OFI_NCCL_PARAM_INT(eager_max_size, "EAGER_MAX_SIZE", 8192);

#ifdef __cplusplus
} // End extern "C"
#endif

#endif // End NCCL_OFI_PARAM_H_
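/*
 * Consumption sketch (illustrative only; select_protocol() is a hypothetical
 * helper, not part of this header): a caller can combine ofi_nccl_protocol()
 * with the documented SENDRECV default, since the parameter's default value
 * is NULL to distinguish "unset" from an explicit user choice.
 *
 *   static const char *select_protocol(void)
 *   {
 *       const char *proto = ofi_nccl_protocol();
 *       // NULL means the user did not set OFI_NCCL_PROTOCOL; fall back to
 *       // the SENDRECV default described above.
 *       return proto != NULL ? proto : "SENDRECV";
 *   }
 */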