/* * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef NCCL_OFI_H_ #define NCCL_OFI_H_ #ifdef _cplusplus extern "C" { #endif #include #include #include #include #include #include #include #include #if HAVE_NEURON #include "nccl-headers/net_neuron.h" #else #include "nccl-headers/net.h" #endif #include "nccl_ofi_log.h" #include "nccl_ofi_topo.h" #ifdef __GNUC__ #define OFI_LIKELY(x) __builtin_expect((x), 1) #define OFI_UNLIKELY(x) __builtin_expect((x), 0) #else #define OFI_LIKELY(x) (x) #define OFI_UNLIKELY(x) (x) #endif #define MAX_PROV_INFO (15) #define MAX_BDF_LEN (25) /* * NCCL_NET_HANDLE_MAXSIZE is a limited resource (and defined in NCCL). * An endpoint address buffer of 56 bytes *should* be large enough to hold * all libfabric providers. In case the requirement changes, NCCL v2.12 * provides enough room to increase this size but we would need to maintain * backwards compatiblity with all NCCL versions. * * We also store tags and communicator stage information in remaining * part of the handle. */ #define MAX_EP_ADDR (56) /* * For each tag, we use MSB as control bit and remaining * for identifying different rings. We look at mem_tag_format for * an endpoint to determine if provider is reserving any MSBs. */ #define OFI_HIGHEST_TAG_BIT (0x1UL << 63) /* * We are supporting minimum 2^32 rings per endpoint and reserving 1 bit * for marking control sends/recvs. */ #define MIN_TAG_BITS_FOR_RING_ID (32 + 1) /* Maximum number of grouped receives */ #define NCCL_OFI_MAX_RECVS 1 /* * This defines a higher value than maximum inflight requests supported by NCCL * while not putting a lot of memory pressure. This higher number ensures that * we are able to support more number of outstanding requests with dynamic buffer * depth changes in NCCL and Neuron. */ #define NCCL_OFI_MAX_REQUESTS (128) #if HAVE_NEURON _Static_assert(NCCL_NET_NEURON_MAX_REQUESTS <= NCCL_OFI_MAX_REQUESTS, "Maximum outstanding requests for plugin is less than what Neuron requires"); #else _Static_assert(NCCL_NET_MAX_REQUESTS <= NCCL_OFI_MAX_REQUESTS, "Maximum outstanding requests for plugin is less than what NCCL requires"); #endif /* Maximum length of directory path */ #define PATH_MAX 4096 /* Flush read size (bytes) */ #define NCCL_OFI_FLUSH_SIZE 4 // Logger Function extern ncclDebugLogger_t ofi_log_function; // Maximum numbers of requests supported by plugin extern int max_reqs; /* Indicates if GPUDirect is supported by libfabric provider */ enum gdr_support_level_t {GDR_UNKNOWN, GDR_SUPPORTED, GDR_UNSUPPORTED}; extern enum gdr_support_level_t support_gdr; /* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used * to flush data to the GPU. Note, CUDA flush support is not supported on all * platforms and should be disabled by default */ extern bool cuda_flush; /* number of duplicate providers to create for each discovered * provider, including renaming to cause NCCL to create additional * rings to use the connections */ extern int nic_dup_conns; /* only allow providers in the comma-separated list provider_filter. Default is no filter. Used by platform files; users can get the same behavior by setting FI_PROVIDER directly. */ extern const char *provider_filter; /* number of cq entries to read in a single call to fi_cq_read. This variable will be updated during init (hence, can not be const), but will not change during execution. Therefore, it may be read in the polling loop without protection of a lock. */ extern size_t cq_read_count; /* Indicates if memory registration of local buffers is required */ extern bool local_mr; /* Indicates if endpoint memory registration is required */ extern bool endpoint_mr; /* Indicates if remote virtual addressing is used */ extern bool virt_addr_mr; /* Selected communication protocol. * * Until the protocol environment variable is checked in init(), this * is the protocol that the plugin will try to initialize; it can be * overridden by platform_init(). After init(), this is the protocol * that was selected. * * Valid values are SENDRECV and RDMA; default is SENDRECV (set by the * param OFI_NCCL_PROTOCOL) */ extern const char *nccl_ofi_selected_protocol; /* Internode network latency reported to NCCL. */ extern float net_latency; struct nccl_net_ofi_plugin; struct nccl_net_ofi_device; struct nccl_net_ofi_ep; struct nccl_net_ofi_req; struct nccl_net_ofi_mr_handle; struct nccl_net_ofi_comm; struct nccl_net_ofi_listen_comm; struct nccl_net_ofi_send_comm; struct nccl_net_ofi_recv_comm; typedef struct nccl_net_ofi_plugin nccl_net_ofi_plugin_t; typedef struct nccl_net_ofi_device nccl_net_ofi_device_t; typedef struct nccl_net_ofi_ep nccl_net_ofi_ep_t; typedef struct nccl_net_ofi_req nccl_net_ofi_req_t; typedef struct nccl_net_ofi_mr_handle nccl_net_ofi_mr_handle_t; typedef struct nccl_net_ofi_comm nccl_net_ofi_comm_t; typedef struct nccl_net_ofi_listen_comm nccl_net_ofi_listen_comm_t; typedef struct nccl_net_ofi_send_comm nccl_net_ofi_send_comm_t; typedef struct nccl_net_ofi_recv_comm nccl_net_ofi_recv_comm_t; /* nccl_net_ofi plugin */ extern nccl_net_ofi_plugin_t *plugin; /** * Request - handle for an outstanding non-blocking communication * * A request will be allocated and returned for every call to send, * recv, or flush. Memory is allocated by the callee to send, recv, * or flush, and will be freed by the callee of test when the request * is complete. */ struct nccl_net_ofi_req { ncclResult_t (*test)(nccl_net_ofi_req_t *req, int *done, int *size); }; typedef struct stack { int top; int size; /* * Array of stack entries comes after stack structure. size field * indicates the size of the array. * NOTE: no more field is allowed beyond this point. */ int array[]; } stack_t; typedef struct free_list { /* Stack of free buffer indexes */ stack_t *free_index; /* Size of buffers array */ uint64_t size; /* * Array of free buffers comes after list head. * NOTE: no more field is allowed beyond this point. */ void *buffers[]; } free_list_t; /* Various stages of connection establishment */ typedef enum nccl_ofi_comm_stage { COMM_CREATE_START = 0, COMM_SEND_CONN, COMM_RECV_CONN, COMM_CONN_REQ_PENDING, COMM_CONN_RESP_REQ_PENDING, COMM_CONNECTED, } nccl_ofi_comm_stage_t; typedef struct save_comm_state { nccl_net_ofi_comm_t *comm; nccl_net_ofi_req_t *req; nccl_ofi_comm_stage_t stage; } save_comm_state_t; typedef struct nccl_ofi_connection_info { char ep_name[MAX_EP_ADDR]; uint64_t ep_namelen; uint64_t connect_to_self; nccl_net_ofi_req_t* req; } nccl_ofi_connection_info_t; typedef struct nccl_net_ofi_conn_handle { char ep_name[MAX_EP_ADDR]; uint64_t tag; /* Save temporary communicator state when creating send communicator */ save_comm_state_t state; } nccl_net_ofi_conn_handle_t; _Static_assert(sizeof(nccl_net_ofi_conn_handle_t) <= NCCL_NET_HANDLE_MAXSIZE, "Size of OFI Handle is too large"); /* * Memory registration key-pool for one rail. * * In the case that this struct does not provide keys, the key pool * array needs to be set to NULL. */ typedef struct nccl_ofi_mr_keypool { /* Size of the key pool */ size_t size; /* Key pool array. Array entries indicate whether key is * vacant or not. */ bool *mr_keys; /* Lock for concurrency on memory registration keys */ pthread_mutex_t lock; } nccl_ofi_mr_keypool_t; /** * Device Data * * A device is roughly a NIC (or a port on a NIC) or a multi-rail * group. While a multi-threaded app may create multiple endpoints * per device, the device data should be shared across multiple * threads in the same process. Sharable structures such as address * vectors, fabrics, and domains should be associated with a device * instead of an endpoint. */ struct nccl_net_ofi_device { /* this device's index in the plugin's devices array */ int dev_id; /* name of the device - should include the provider name, but may be augmented (in the case of mrail). Set during the transport's initialization, and should be read-only from that point. */ char *name; ncclResult_t (*get_properties)(int num_devices, nccl_net_ofi_device_t *base_dev, ncclNetProperties_t *props); /* * @brief Get nccl_ofi_ep for given * nccl_ofi_device. Create if it does not exist. Store * in pthread key. Increase reference counter. Must be * protected by lock stored in device. * * During the plugin initialization, this function will be * called once per process using one of the instantiated device structs * to create and configure the endpoint of the initializing thread. */ ncclResult_t (*get_ep)(nccl_net_ofi_device_t *base_dev, nccl_net_ofi_ep_t **ep); }; /** * Endpoint - A per-Proxy Thread device abstraction * * The device structure is shared across potentially multiple proxy * threads (depending on NCCL configuration). The Endpoint abstracts * a unique address (assuming an RDM provider), allowing for the * possibility that the underlying transport uses an endpoint per * thread (or per thread calling listen/connect) to drive traffic * across multiple Libfabric endpoints and completion queues. * * Endpoints are implicitly created as part of the get_ep() call * in the device interface. Whether they are created during the first * call to get_ep() or during initialization is left to the * implementation. */ struct nccl_net_ofi_ep { /* Backpointer to the device associated with this ep. */ nccl_net_ofi_device_t *device; /* Create a receiving object and provide a handle to it. * * The callee can expect that the handle provides * NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged across * the wire through an out of band mechanism. The callee must * allocate memory for listen_comm. * * The callee has to guarantee that the state stage of the * handle is set to COMM_CREATE_START. */ ncclResult_t (*listen)(nccl_net_ofi_ep_t *ep, nccl_net_ofi_conn_handle_t *handle, nccl_net_ofi_listen_comm_t **listen_comm); /* Create a connection to a process that has called * listen(). * * The callee has to guarantee the following invariants when * this function returns ncclSuccess and no send * communicator has been returned * 1) The state stage of the handle is set to a value * different from COMM_CREATE_START. * 2) The communicator state of the handle stores a pointer to * a communicator. Also, the endpoint pointer member variable * of that communicator points to the endpoint passed to * this connect() function. * * The callee must allocate memory for send_comm. */ ncclResult_t (*connect)(nccl_net_ofi_ep_t *ep, nccl_net_ofi_conn_handle_t *handle, nccl_net_ofi_send_comm_t **send_comm); /* * @brief Release nccl_ofi_ep. * * Decrease reference counter. Release resources and free * endpoint if reference counter becomes zero. Must be * protected by lock stored in base_dev. */ ncclResult_t (*release_ep)(nccl_net_ofi_ep_t *ep); }; enum nccl_net_ofi_comm_type_t { NCCL_NET_OFI_BASE_COMM, NCCL_NET_OFI_LISTEN_COMM, NCCL_NET_OFI_SEND_COMM, NCCL_NET_OFI_RECV_COMM, }; /** * Communicator - base class for communicator structures * * This is the base class for the listen, send, and recv * communicators. It should not be directly extended by transports, * but instead underlying transports should extend the listen, send, * and recv communicators. */ struct nccl_net_ofi_comm { enum nccl_net_ofi_comm_type_t type; nccl_net_ofi_ep_t *ep; int dev_id; }; /** * Listen Communicator - Communicator for a listen/accept pairing */ struct nccl_net_ofi_listen_comm { nccl_net_ofi_comm_t base; ncclResult_t (*accept)(nccl_net_ofi_listen_comm_t *listen_comm, nccl_net_ofi_recv_comm_t **recv_comm); ncclResult_t (*close)(nccl_net_ofi_listen_comm_t *listen_comm); }; struct nccl_net_ofi_send_comm { nccl_net_ofi_comm_t base; /* * @brief Register memory region on send communicator (both Host and CUDA) * * @return Memory handle for data send operations * @return 0 on success * non-zero on error */ ncclResult_t (*regMr)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int type, void **mhandle); /* * @brief Register DMA memory region on send communicator (both Host and CUDA) * * This operation is not supported. * * @return Memory handle for data send operations * @return ncclInternalError */ ncclResult_t (*regMrDmaBuf)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int type, uint64_t offset, int fd, nccl_net_ofi_mr_handle_t **handle); /* * @brief Deregister memory region on send communicator (both Host and CUDA) * * @return Memory handle for data send operations * @return 0 on success * non-zero on error */ ncclResult_t (*deregMr)(nccl_net_ofi_send_comm_t *send_comm, nccl_net_ofi_mr_handle_t *mhandle); ncclResult_t (*send)(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int tag, nccl_net_ofi_mr_handle_t *mhandle, nccl_net_ofi_req_t **req); ncclResult_t (*close)(nccl_net_ofi_send_comm_t *send_comm); }; struct nccl_net_ofi_recv_comm { nccl_net_ofi_comm_t base; /* * @brief Register memory region on recv communicator (both Host and CUDA) * * @return Memory handle for data recv operations * @return 0 on success * non-zero on error */ ncclResult_t (*regMr)(nccl_net_ofi_recv_comm_t *recv_comm, void *data, size_t size, int type, void **mhandle); /* * @brief Register DMA memory region on recv communicator (both Host and CUDA) * * This operation is not supported. * * @return Memory handle for data recv operations * @return ncclInternalError */ ncclResult_t (*regMrDmaBuf)(nccl_net_ofi_recv_comm_t *recv_comm, void *data, size_t size, int type, uint64_t offset, int fd, nccl_net_ofi_mr_handle_t **handle); /* * @brief Deregister memory region on recv communicator (both Host and CUDA) * * @return Memory handle for data recv operations * @return 0 on success * non-zero on error */ ncclResult_t (*deregMr)(nccl_net_ofi_recv_comm_t *recv_comm, nccl_net_ofi_mr_handle_t *mhandle); ncclResult_t (*recv)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, int *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **req); ncclResult_t (*flush)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, int *sizes, nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **req); ncclResult_t (*close)(nccl_net_ofi_recv_comm_t *recv_comm); }; /** * Top-level plugin data * * Data associated with an instance of the plugin (which may involve * multiple proxy threads and multiple devices). There will be a * single instance of this structure, exposed as a global variable * named nccl_net_ofi_plugin, which is valid after NCCL calls init() * on the plugin. */ struct nccl_net_ofi_plugin { /* Array of devices */ nccl_net_ofi_device_t **devs; /* Number of devices in devs array */ int num_devs; }; /** * Initialize plugin. This function sets properties of the global plugin variable * defined below. */ void nccl_net_ofi_init_plugin(nccl_net_ofi_device_t **base_devs, int num_infos); /* * @brief Set properties obtained from libfabric NIC Info. * * @return Populated props structure */ ncclResult_t nccl_net_ofi_info_properties(struct fi_info *nic_prov, int dev_id, int num_devices, ncclNetProperties_t *props); /* * @brief Allocates and initialises libfabric endpoint and AV. * * @return Endpoint ep * @return Address vector av */ ncclResult_t nccl_ofi_init_connection(struct fi_info *info, struct fid_domain *domain, struct fid_ep **ep, struct fid_av **av, struct fid_cq **cq); /* * @brief Allocates free list for NCCL OFI requests */ ncclResult_t allocate_ofi_fl(free_list_t **nccl_ofi_req_fl, size_t fl_size, size_t buffer_size); /* * @brief Release free list for NCCL OFI requests */ void free_ofi_fl(free_list_t *nccl_ofi_req_fl); /* * @brief Allocate a element from free_list fl. */ void *allocate_fl_buff(free_list_t *fl, size_t buff_sz, uint64_t *next_avail_index); /* * @brief Initialize memory registration keypool */ ncclResult_t nccl_ofi_mr_keys_init(nccl_ofi_mr_keypool_t *key_pool, bool provide_mr_keys); /* * @brief Returns provider info structure for the given NIC ID. */ struct fi_info *get_nic_info(int dev_id, struct fi_info *info_list); /* * @brief Release libfabric endpoint and address vector */ void nccl_ofi_ep_release_ofi(struct fid_ep *ep, struct fid_av *av, struct fid_cq *cq, int dev_id); /* * @brief Register DMA buffer for send comm. Unimplemented. */ ncclResult_t nccl_net_ofi_reg_mr_dma_buf_recv_comm(nccl_net_ofi_recv_comm_t *recv_comm, void *data, size_t size, int type, uint64_t offset, int fd, nccl_net_ofi_mr_handle_t **handle); /* * @brief Register DMA buffer for recv comm. Unimplemented. */ ncclResult_t nccl_net_ofi_reg_mr_dma_buf_send_comm(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int type, uint64_t offset, int fd, nccl_net_ofi_mr_handle_t **handle); /* * @brief Free a memory registration key */ ncclResult_t nccl_net_ofi_free_mr_key(nccl_ofi_mr_keypool_t *key_pool, uint64_t key); /* * @brief Allocate a memory registration key */ uint64_t nccl_net_ofi_allocate_mr_key(nccl_ofi_mr_keypool_t *key_pool); /* * @brief Free libfabric NIC info list. * * Frees each node of the list. No operation if list is NULL. * * @param info_list * List or circular list of libfabric NIC infos */ void nccl_net_ofi_free_info_list(struct fi_info *info_list); /* Declare a platform-specific initialization hook that can be * provided by platform-specific source files (such as the optionally * compiled platform_aws.c). The function is declared as a weak * symbol so that linkage will not break if no platform specific hook * is provided; in that case platform_init will be NULL at runtime. */ ncclResult_t platform_init(void) __attribute__((weak)); /* Declare a platform-specific endpoint configuration hook that can be * provided by platform-specific source files (such as the optionally * compiled platform_aws.c). The function is declared as a weak * symbol so that linkage will not break if no platform specific hook * is provided; in that case platform_config_endpoint will be NULL at runtime. */ ncclResult_t platform_config_endpoint(struct fi_info *info, struct fid_ep *ep) __attribute__((weak)); #ifdef _cplusplus } // End extern "C" #endif #endif // End NCCL_OFI_H_