/*
 * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
 * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
 */

#include "config.h"

#define _GNU_SOURCE
/*
 * NOTE(review): the header names of the system includes were lost when this
 * file was extracted (the directives read as bare "#include"). The set below
 * covers every standard-library name this file uses; the guarded header name
 * is inferred from its HAVE_RDMA_FI_EXT_H macro. Confirm against version
 * control before merging.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_RDMA_FI_EXT_H
#include <rdma/fi_ext.h>
#endif

#include "nccl_ofi.h"
#include "nccl_ofi_log.h"
#include "nccl_ofi_param.h"
#include "tracepoint.h"

/*
 * Set to true during the first endpoint configuration if the corresponding
 * in-order 128 byte option could be enabled on the endpoint.
 */
static bool sendrecv_support_ll128 = false;
static bool write_support_ll128 = false;

/* Cached value of ofi_nccl_disable_native_rdma_check(), set in platform_init() */
static bool disable_native_rdma_check;

/* Platform type string read by platform_init(); NULL until then */
const char *platform_type;

/*
 * Per-platform defaults, keyed by the platform type string.
 */
struct ec2_platform_data {
	/* Platform type, compared against the DMI product name */
	const char* name;
	/* Topology file name relative to XML_DIR, or NULL if there is none */
	const char* topology;
	/* Value used for nic_dup_conns when it is still 0 */
	int default_dup_conns;
	/* Value used for net_latency (in us) when the user did not provide one */
	float latency;
} platform_data_map[] = {
	{
		.name = "p4d.24xlarge",
		.topology = "p4d-24xl-topo.xml",
		.default_dup_conns = 0,
		.latency = 75.0,
	},
	{
		.name = "p4de.24xlarge",
		.topology = "p4de-24xl-topo.xml",
		.default_dup_conns = 0,
		.latency = 75.0,
	},
	{
		.name = "p3dn.24xlarge",
		.topology = NULL,
		.default_dup_conns = 4,
		.latency = 150.0,
	},
	{
		.name = "p5.48xlarge",
		.topology = "p5.48xl-topo.xml",
		.default_dup_conns = 0,
		.latency = 75.0,
	},
};

/*
 * @brief	Provides EC2 platform type as reported by the
 * 		first line of
 * 		/sys/devices/virtual/dmi/id/product_name.
 * 		Users of this API *should* free the buffer when a
 * 		Non-NULL string is returned.
 *
 * @return	NULL, on allocation and file system error
 * 		EC2 platform type, on success
 */
static const char* get_platform_type(void)
{
	char file[] = "/sys/devices/virtual/dmi/id/product_name";
	FILE *fd = NULL;
	/* int, not char: fgetc() returns EOF out-of-band */
	int ch;
	size_t len = 0;
	size_t capacity = 64;
	char *buf = NULL;

	fd = fopen(file, "r");
	if (fd == NULL) {
		NCCL_OFI_WARN("Error opening file: %s", file);
		goto error;
	}

	buf = malloc(capacity);
	if (buf == NULL) {
		NCCL_OFI_WARN("Unable to allocate platform type");
		goto error;
	}

	/* Read first line of the file, reallocing the buffer as necessary */
	while ((ch = fgetc(fd)) != EOF && ch != '\n') {
		/* Always keep one byte free for the terminating NUL */
		if (len + 1 >= capacity) {
			size_t new_capacity = capacity * 2;
			char *tmp = realloc(buf, new_capacity);
			if (tmp == NULL) {
				NCCL_OFI_WARN("Unable to allocate platform type");
				goto error;
			}
			buf = tmp;
			capacity = new_capacity;
		}
		buf[len++] = (char)ch;
	}

	if (ferror(fd)) {
		NCCL_OFI_WARN("Error reading file: %s", file);
		goto error;
	}

	buf[len] = '\0';

	NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Read %zu bytes. EC2 platform type is %s", len, buf);

	fclose(fd);
	return buf;

error:
	free(buf);
	if (fd)
		fclose(fd);
	/* Never hand the freed buffer back to the caller */
	return NULL;
}

/*
 * @brief	Returns the platform data entry for the given platform type, if found
 *
 * @input	Platform type (may be NULL)
 *
 * @return	NULL, if the platform type is NULL or has no entry
 * 		Pointer into platform_data_map, if match found
 */
struct ec2_platform_data *get_platform_data(const char *platform_type)
{
	const size_t platform_n = sizeof(platform_data_map)/sizeof(platform_data_map[0]);

	/* The global platform type is NULL until platform_init() succeeded */
	if (platform_type == NULL)
		return NULL;

	for (size_t idx = 0; idx < platform_n; idx++) {
		if (strcmp(platform_type, platform_data_map[idx].name) == 0)
			return &platform_data_map[idx];
	}

	return NULL;
}

/*
 * @brief	Force NCCL_PROTO to "simple" when neither the SENDRECV nor the
 * 		WRITE in-order 128 byte option could be enabled, unless the
 * 		user already set NCCL_PROTO. A user-provided value other than
 * 		"simple" only produces a warning.
 *
 * @input	Platform data (currently unused)
 *
 * @return	ncclSuccess, on success
 * 		ncclSystemError, if the environment variable could not be set
 */
static ncclResult_t configure_nccl_proto(struct ec2_platform_data *platform_data)
{
	int ret = ncclSuccess;

	/* Explicitly set the simple protocol using the "NCCL_PROTO" environment
	 * variable whenever we know that the LL/LL128 protocols are not safe,
	 * such as on P4d/P4e.
	 *
	 * This only has impact on the Nvidia CUDA case, as the
	 * Trainium code does not use the LL/LL128 protocols.
	 */
	bool support_ll128_proto = sendrecv_support_ll128 || write_support_ll128;
	if (!support_ll128_proto) {
		const char *user_proto = getenv("NCCL_PROTO");

		if (user_proto == NULL) {
			NCCL_OFI_INFO(NCCL_INIT, "Setting NCCL_PROTO to \"simple\"");
			int rc = setenv("NCCL_PROTO", "simple", 0);
			if (rc) {
				NCCL_OFI_WARN("Error setting NCCL_PROTO environment variable: %d", rc);
				ret = ncclSystemError;
				goto exit;
			}
		} else if (strcmp(user_proto, "simple")) {
			NCCL_OFI_WARN("NCCL_PROTO was set to \"LL/LL128\", but the Libfabric endpoint does not support 128 byte in-order aligned stores. This endpoint may corrupt data during communication");
		}
	}

exit:
	return ret;
}

/*
 * @brief	Check that the endpoint does not emulate RDMA writes when the
 * 		selected communication protocol is "RDMA".
 *
 * @input	Endpoint to query
 *
 * @return	ncclSuccess, if the option could be read and is acceptable
 * 		ncclSystemError, if the option is not declared at build time,
 * 		cannot be read, or reports emulated writes under "RDMA"
 */
static ncclResult_t validate_rdma_write(struct fid_ep *ep)
{
	int ret = ncclSuccess;
#if HAVE_DECL_FI_OPT_EFA_EMULATED_WRITE
	bool optval = false;
	/* On input, optlen tells fi_getopt() how large the optval buffer is */
	size_t optlen = sizeof(optval);

	ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_EMULATED_WRITE, &optval, &optlen);
	if (ret != 0 || optlen != sizeof(bool)) {
		NCCL_OFI_WARN("Couldn't get FI_OPT_EFA_EMULATED_WRITE. optlen: %zu, RC: %d, ERROR: %s",
			      optlen, ret, fi_strerror(-ret));
		ret = ncclSystemError;
		goto exit;
	}

	/* If the selected protocol is RDMA write and RDMA write is not
	 * supported for the endpoint, throw an error
	 */
	if (optval && 0 == strcmp("RDMA", nccl_ofi_selected_protocol)) {
		NCCL_OFI_WARN("FI_OPT_EFA_EMULATED_WRITE is true when the communication protocol is RDMA write.");
		ret = ncclSystemError;
		goto exit;
	}

	NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Get endpoint option FI_OPT_EFA_EMULATED_WRITE. optval: %d",
		       optval);
#else
	NCCL_OFI_WARN("FI_OPT_EFA_EMULATED_WRITE not declared when the communication protocol is RDMA write.");
	ret = ncclSystemError;
	goto exit;
#endif

exit:
	return ret;
}

/*
 * @brief	Try to enable FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES on
 * 		the endpoint. During initialization the outcome is recorded in
 * 		sendrecv_support_ll128; afterwards the outcome must match what
 * 		was recorded. Does nothing when the option is not declared at
 * 		build time.
 *
 * @input	Endpoint to configure
 * 		Whether this is the call made during plugin initialization
 *
 * @return	ncclSuccess, on success
 * 		ncclSystemError, on unexpected fi_setopt() failure or when the
 * 		outcome differs from the one seen during initialization
 */
static ncclResult_t configure_sendrecv_inorder(struct fid_ep *ep, bool is_init)
{
	int ret = ncclSuccess;
#if HAVE_DECL_FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES
	bool optval = true;

	ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT,
			FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES,
			&optval, sizeof(optval));
	if (ret != 0 && ret != -FI_EOPNOTSUPP) {
		NCCL_OFI_WARN("Couldn't set FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		ret = ncclSystemError;
		goto exit;
	}

	/* If this is called during plugin initialization, set the global flag
	 * sendrecv_support_ll128 to true if
	 * FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES could be set to true,
	 * otherwise keep it at its default value of false.
	 */
	if (is_init) {
		if (ret == 0) {
			sendrecv_support_ll128 = true;
		}
	}
	/* If an endpoint supported SENDRECV LL128 during plugin initialization
	 * but does not support it now, throw an error.
	 */
	else if (sendrecv_support_ll128 && ret == -FI_EOPNOTSUPP) {
		NCCL_OFI_WARN("SENDRECV LL128 not supported while it was supported during initialization.");
		ret = ncclSystemError;
		goto exit;
	}
	/* If an endpoint did not support SENDRECV LL128 during plugin
	 * initialization but supports it now, throw an error.
	 */
	else if (!sendrecv_support_ll128 && ret == 0) {
		NCCL_OFI_WARN("SENDRECV LL128 supported while it not supported during initialization.");
		ret = ncclSystemError;
		goto exit;
	}

	NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Set endpoint option FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES. optval: %d, RC: %d, ERROR: %s",
		       optval, ret, fi_strerror(-ret));

	/* -FI_EOPNOTSUPP is an accepted outcome, not an error */
	ret = ncclSuccess;

exit:
#endif
	return ret;
}

/*
 * @brief	Try to enable FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES on the
 * 		endpoint. During initialization the outcome is recorded in
 * 		write_support_ll128; afterwards the outcome must match what was
 * 		recorded. Does nothing when the option is not declared at build
 * 		time.
 *
 * @input	Endpoint to configure
 * 		Whether this is the call made during plugin initialization
 *
 * @return	ncclSuccess, on success
 * 		ncclSystemError, on unexpected fi_setopt() failure or when the
 * 		outcome differs from the one seen during initialization
 */
static ncclResult_t configure_write_inorder(struct fid_ep *ep, bool is_init)
{
	int ret = ncclSuccess;
#if HAVE_DECL_FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES
	bool optval = true;

	ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT,
			FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES,
			&optval, sizeof(optval));
	if (ret != 0 && ret != -FI_EOPNOTSUPP) {
		NCCL_OFI_WARN("Couldn't set FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		ret = ncclSystemError;
		goto exit;
	}

	/* If this is called during plugin initialization, set the global flag
	 * write_support_ll128 to true if
	 * FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES could be set to true,
	 * otherwise keep it at its default value of false.
	 */
	if (is_init) {
		if (ret == 0) {
			write_support_ll128 = true;
		}
	}
	/* If an endpoint supported WRITE LL128 during plugin initialization but
	 * does not support it now, throw an error.
	 */
	else if (write_support_ll128 && ret == -FI_EOPNOTSUPP) {
		NCCL_OFI_WARN("WRITE LL128 not supported while it was supported during initialization.");
		ret = ncclSystemError;
		goto exit;
	}
	/* If an endpoint did not support WRITE LL128 during plugin
	 * initialization but supports it now, throw an error.
	 */
	else if (!write_support_ll128 && ret == 0) {
		NCCL_OFI_WARN("WRITE LL128 supported while it not supported during initialization.");
		ret = ncclSystemError;
		goto exit;
	}

	NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Set endpoint option FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES. optval: %d, RC: %d, ERROR: %s",
		       optval, ret, fi_strerror(-ret));

	/* -FI_EOPNOTSUPP is an accepted outcome, not an error */
	ret = ncclSuccess;

exit:
#endif
	return ret;
}

/*
 * @brief	Apply AWS-specific defaults: provider filter, fork-safety and
 * 		NVLS environment variables (CUDA builds only), NCCL's system
 * 		topology using static pre-configured topology files for
 * 		supported EC2 platform types, duplicate connection count and
 * 		network latency.
 *
 * @return	0, when we are successfully able to update NCCL topology or
 * 		   if we find no match
 * 		error, on failure
 */
ncclResult_t platform_init(void)
{
	int ret = ncclSuccess;
	int rc = 0;
	struct ec2_platform_data *platform_data;
#if HAVE_CUDA
	uint32_t libversion = 0;
#endif

	NCCL_OFI_INFO(NCCL_INIT, "Configuring AWS-specific options");

	platform_type = get_platform_type();
	if (platform_type == NULL) {
		ret = ncclSystemError;
		goto exit;
	}

	platform_data = get_platform_data(platform_type);

	/* if we're here, we think we're on an EC2 instance, so force
	 * EFA provider (for platforms without EFA, this will cause a
	 * fallback to NCCL's internal TCP.  In the case of Neuron, a
	 * hard failure when there are no NICs.  Both are the
	 * behaviors we want).
	 */
	if (!getenv("FI_PROVIDER")) {
		NCCL_OFI_INFO(NCCL_INIT, "Setting provider_filter to efa");
		provider_filter = "efa";
	}

#if HAVE_CUDA
	/*
	 * FI_EFA_FORK_SAFE environment variable tells Libfabric to enable
	 * fork-safe support in legacy versions of the rdma-core library.
	 * Libfabric checks if additional handling is required for fork safety,
	 * and does not introduce this additional overhead of setting MADV_DONTFORK
	 * for new versions of rdma-core (38.0 and later) and the Linux kernel
	 * that support copy-on-fork for pinned memory (5.13 and later).
	 * These new versions are always fork-safe and additional support in userspace
	 * is not required.
	 *
	 * When legacy versions of the kernel and rdma-core are used, setting
	 * FI_EFA_FORK_SAFE to 1 disables the use of huge pages in Libfabric.
	 *
	 * To prevent data corruption, the EFA provider registers an atfork
	 * handler which will abort the process whenever it believes
	 * rdma-core is not fork-safe.
	 *
	 * NCCL applications heavily re-use the buffers for communication and
	 * thus are not sensitive to increased memory registration costs.
	 * To prevent NCCL based applications from getting aborted when using
	 * fork(), the plugin explicitly enables FI_EFA_FORK_SAFE environment
	 * variable, even in legacy environments where the overhead is high.
	 *
	 * The Neuron team has asked us to skip trying to set this
	 * environment variable on Neuron platforms, so we only do
	 * this for Nvidia platforms.
	 */
	libversion = fi_version();
	const char * fork_safe_var_name =
		(FI_MAJOR(libversion) > 1 || (FI_MAJOR(libversion) == 1 && FI_MINOR(libversion) >= 13))
		? "FI_EFA_FORK_SAFE"
		: "RDMAV_FORK_SAFE";
	if (!getenv(fork_safe_var_name)) {
		NCCL_OFI_INFO(NCCL_INIT, "Setting %s environment variable to 1", fork_safe_var_name);
		rc = setenv(fork_safe_var_name, "1", 1);
		if (rc != 0) {
			NCCL_OFI_WARN("Unable to set %s", fork_safe_var_name);
			ret = ncclSystemError;
			goto exit;
		}
	}

	/* Disable NVLS topology discovery.  There's a bug with EFA
	 * and NCCL 2.17/2.18 that is still under investigation that
	 * causes random failures due to memory corruption during
	 * initialization.  For now, skip that code.  We need to come
	 * back to this when the bug is fixed.
	 */
	if (getenv("NCCL_NVLS_ENABLE") == NULL) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Disabling NVLS support when using Libfabric on AWS.");
		rc = setenv("NCCL_NVLS_ENABLE", "0", 1);
		if (rc != 0) {
			NCCL_OFI_WARN("Unable to set NCCL_NVLS_ENABLE");
			ret = ncclSystemError;
			goto exit;
		}
	}
#endif

	/*
	 * Update topology if platform topology is available and
	 * environment variable NCCL_TOPO_FILE is not set.
	 */
	if (getenv("NCCL_TOPO_FILE")) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
			      "Running on %s platform, NCCL_TOPO_FILE environment variable is already set to %s",
			      platform_type, getenv("NCCL_TOPO_FILE"));
	} else if (platform_data && platform_data->topology) {
		char topology_path[PATH_MAX];

		rc = snprintf(topology_path, sizeof(topology_path), "%s/%s",
			      XML_DIR, platform_data->topology);
		/* rc is known non-negative before it is compared as a size */
		if (rc < 0 || (size_t)rc >= sizeof(topology_path)) {
			NCCL_OFI_WARN("Error occurred while forming the complete topology XML file path. RC: %d, Buffer Size: %d, XML dir: %s, Topology file: %s",
				      rc, PATH_MAX, XML_DIR, platform_data->topology);
			ret = ncclSystemError;
			goto exit;
		}

		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
			      "Running on %s platform, Setting NCCL_TOPO_FILE environment variable to %s",
			      platform_type, topology_path);

		rc = setenv("NCCL_TOPO_FILE", topology_path, 1);
		if (rc != 0) {
			NCCL_OFI_WARN("Unable to set NCCL_TOPO_FILE");
			ret = ncclSystemError;
			goto exit;
		}
	}

	if (nic_dup_conns == 0 && platform_data)
		nic_dup_conns = platform_data->default_dup_conns;

	disable_native_rdma_check = (bool) ofi_nccl_disable_native_rdma_check();

	if (ofi_nccl_net_latency() < 0) {
		if (platform_data && platform_data->latency >= 0.0) {
			net_latency = platform_data->latency;
		} else {
			/* For historical reasons, default value for EFA is 150 us */
			net_latency = 150.0;
		}
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Internode latency set at %.1f us",
			      net_latency);
	}

exit:
	return ret;
}

/*
 * @brief	Apply EFA-specific endpoint options. On the first call this
 * 		also detects LL/LL128 support and configures NCCL_PROTO
 * 		accordingly (CUDA builds only). Endpoints of providers other
 * 		than "efa" are left untouched.
 *
 * @input	Provider info the endpoint was created from
 * 		Endpoint to configure
 *
 * @return	ncclSuccess, on success or when the provider is not "efa"
 * 		ncclSystemError, on invalid arguments or configuration failure
 */
ncclResult_t platform_config_endpoint(struct fi_info *info, struct fid_ep* endpoint)
{
	/* True only for the first call, whatever its outcome (see exit path) */
	static bool is_init = true;
	int ret = ncclSuccess;

	if (endpoint == NULL) {
		NCCL_OFI_WARN("Unable to configure invalid endpoint");
		ret = ncclSystemError;
		goto exit;
	}

	/* The provider name is dereferenced below; reject incomplete info */
	if (info == NULL || info->fabric_attr == NULL ||
	    info->fabric_attr->prov_name == NULL) {
		NCCL_OFI_WARN("Unable to configure endpoint without provider information");
		ret = ncclSystemError;
		goto exit;
	}

	/* short circuit when not using EFA */
	if (0 != strcmp(info->fabric_attr->prov_name, "efa")) {
		ret = ncclSuccess;
		goto exit;
	}

	/* If the selected communication protocol is RDMA write and the user did
	 * not disable the native RDMA support check, validate that the
	 * FI_OPT_EFA_EMULATED_WRITE endpoint option can be accessed, and that
	 * emulated writes are disabled.
	 */
	if (0 == strcmp("RDMA", nccl_ofi_selected_protocol) &&
	    !disable_native_rdma_check) {
		ret = validate_rdma_write(endpoint);
		if (ret != 0) {
			goto exit;
		}
	}

#if HAVE_CUDA
	/* During initialization, if the chosen communication protocol is
	 * SENDRECV, try to set FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES
	 * to true to see if the LL/LL128 protocol is supported. After
	 * initialization, try to set the option to true again and if the
	 * LL/LL128 protocols are not supported for SENDRECV and were supported
	 * in initialization, throw an error.
	 */
	if (0 == strcmp("SENDRECV", nccl_ofi_selected_protocol)) {
		ret = configure_sendrecv_inorder(endpoint, is_init);
		if (ret != 0) {
			goto exit;
		}
	}
	/* During initialization, if the chosen communication protocol is RDMA, try to
	 * set FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES to true to see if the
	 * LL/LL128 protocol is supported. After initialization, try to set the option
	 * to true again and if the LL/LL128 protocols are not supported for RDMA and
	 * were supported in initialization, throw an error.
	 */
	else if (0 == strcmp("RDMA", nccl_ofi_selected_protocol)) {
		ret = configure_write_inorder(endpoint, is_init);
		if (ret != 0) {
			goto exit;
		}
	}

	/* if this is called during the plugin initialization, determine whether
	 * to explicitly set NCCL_PROTO to "simple" based on whether we support
	 * the LL/LL128 NCCL protocols.
	 */
	if (is_init) {
		struct ec2_platform_data *platform_data = get_platform_data(platform_type);

		ret = configure_nccl_proto(platform_data);
		if (ret != 0) {
			goto exit;
		}
	}
#endif // HAVE_CUDA

exit:
	/* if this is called during the plugin initialization, indicate that the
	 * initialization has already completed and should not be done again
	 */
	if (is_init) {
		is_init = false;
	}

	return ret;
}