/*
 * Copyright (c) 2015-2018 Cray Inc. All rights reserved.
 * Copyright (c) 2015-2018 Los Alamos National Security, LLC.
 *                         All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <sys/mman.h>

#include "gnix.h"
#include "gnix_nic.h"
#include "gnix_cm_nic.h"
#include "gnix_vc.h"
#include "gnix_mbox_allocator.h"
#include "gnix_util.h"
#include "fi_ext_gni.h"

/*
 * TODO: make this a domain parameter
 */
#define GNIX_VC_FL_MIN_SIZE 128
#define GNIX_VC_FL_INIT_REFILL_SIZE 10

static int gnix_nics_per_ptag[GNI_PTAG_MAX];
struct dlist_entry gnix_nic_list_ptag[GNI_PTAG_MAX];
DLIST_HEAD(gnix_nic_list);
pthread_mutex_t gnix_nic_list_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * globals
 */

uint32_t gnix_max_nics_per_ptag = GNIX_DEF_MAX_NICS_PER_PTAG;

/*
 * local variables
 */

static struct gnix_nic_attr default_attr = {
		.gni_cdm_hndl        = NULL,
		.gni_nic_hndl        = NULL
};

/*******************************************************************************
 * Helper functions.
 ******************************************************************************/

/*
 * this function is intended to be invoked as an argument to
 * pthread_create.
 */
static void *__gnix_nic_prog_thread_fn(void *the_arg)
{
	int ret = FI_SUCCESS, prev_state;
	int retry = 0;
	uint32_t which;

	struct gnix_nic *nic = (struct gnix_nic *)the_arg;
	sigset_t sigmask;
	gni_cq_handle_t cqv[2];
	gni_return_t status;
	gni_cq_entry_t cqe;

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	/*
	 * temporarily disable cancelability while we set up
	 * some stuff
	 */

	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &prev_state);

	/*
	 * help out Cray core-spec, say we're not an app thread
	 * and can be run on core-spec cpus.
	 */

	ret = _gnix_task_is_not_app();
	if (ret)
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "_gnix_task_is_not_app call returned %d\n", ret);

	/*
	 * block all signals, don't want this thread to catch
	 * signals that may be for app threads
	 */

	memset(&sigmask, 0, sizeof(sigset_t));
	ret = sigfillset(&sigmask);
	if (ret) {
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "sigfillset call returned %d\n", ret);
	} else {
		ret = pthread_sigmask(SIG_SETMASK, &sigmask, NULL);
		if (ret)
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "pthread_sigmask call returned %d\n", ret);
	}

	/*
	 * okay now we're ready to be cancelable.
	 */

	pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &prev_state);
	pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	cqv[0] = nic->tx_cq_blk;
	cqv[1] = nic->rx_cq_blk;

try_again:
	status = GNI_CqVectorMonitor(cqv, 2, -1, &which);

	switch (status) {
	case GNI_RC_SUCCESS:

		/*
		 * first dequeue RX CQEs
		 */
		if (nic->rx_cq_blk != nic->rx_cq && which == 1) {
			do {
				status = GNI_CqGetEvent(nic->rx_cq_blk, &cqe);
			} while (status == GNI_RC_SUCCESS);
		}
		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &prev_state);
		_gnix_nic_progress(nic);
		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &prev_state);
		retry = 1;
		break;
	case GNI_RC_TIMEOUT:
	case GNI_RC_NOT_DONE:
	/* Invalid state indicates the call was interrupted by a signal,
	 * as can happen when running under various tools. */
	case GNI_RC_INVALID_STATE:
		retry = 1;
		break;
	case GNI_RC_INVALID_PARAM:
	case GNI_RC_ERROR_RESOURCE:
	case GNI_RC_ERROR_NOMEM:
		retry = 0;
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "GNI_CqVectorMonitor returned %s\n",
			  gni_err_str[status]);
		break;
	default:
		retry = 0;
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "GNI_CqVectorMonitor returned unexpected code %s\n",
			  gni_err_str[status]);
		break;
	}

	if (retry)
		goto try_again;

	return NULL;
}

/*
 * setup memory registration for remote GNI_PostCqWrite's to target
 */
static int __nic_setup_irq_cq(struct gnix_nic *nic)
{
	int ret = FI_SUCCESS;
	size_t len;
	gni_return_t status;
	int fd = -1;
	void *mmap_addr;
	int vmdh_index = -1;
	int flags = GNI_MEM_READWRITE;
	struct gnix_auth_key *info;
	struct fi_gni_auth_key key;

	len = (size_t)sysconf(_SC_PAGESIZE);

	mmap_addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_ANON, fd, 0);
	if (mmap_addr == MAP_FAILED) {
		GNIX_WARN(FI_LOG_EP_CTRL, "mmap failed - %s\n",
			  strerror(errno));
		ret = -errno;
		goto err;
	}

	nic->irq_mmap_addr = mmap_addr;
	nic->irq_mmap_len = len;

	/* On some systems, the page may not be zero'd from first use.
	 * Memset it here. */
	memset(mmap_addr, 0x0, len);

	if (nic->using_vmdh) {
		key.type = GNIX_AKT_RAW;
		key.raw.protection_key = nic->cookie;

		info = _gnix_auth_key_lookup((uint8_t *) &key, sizeof(key));
		assert(info);

		if (!nic->mdd_resources_set) {
			/* check to see if the ptag registration limit was set
			 * yet or not -- becomes read-only after success */
			ret = _gnix_auth_key_enable(info);
			if (ret != FI_SUCCESS && ret != -FI_EBUSY) {
				GNIX_WARN(FI_LOG_DOMAIN,
					  "failed to enable authorization key, "
					  "unexpected error rc=%d\n", ret);
			}

			status = GNI_SetMddResources(nic->gni_nic_hndl,
					(info->attr.prov_key_limit +
					 info->attr.user_key_limit));
			if (status != GNI_RC_SUCCESS) {
				GNIX_FATAL(FI_LOG_DOMAIN,
					   "failed to set MDD resources, rc=%d\n",
					   status);
			}

			nic->mdd_resources_set = 1;
		}

		vmdh_index = _gnix_get_next_reserved_key(info);
		if (vmdh_index <= 0) {
			GNIX_FATAL(FI_LOG_DOMAIN,
				   "failed to get next reserved key, "
				   "rc=%d\n", vmdh_index);
		}

		flags |= GNI_MEM_USE_VMDH;
	}

	status = GNI_MemRegister(nic->gni_nic_hndl,
				 (uint64_t) nic->irq_mmap_addr,
				 len,
				 nic->rx_cq_blk,
				 flags,
				 vmdh_index,
				 &nic->irq_mem_hndl);
	if (status != GNI_RC_SUCCESS) {
		ret = gnixu_to_fi_errno(status);
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "GNI_MemRegister returned %s\n",
			  gni_err_str[status]);
		goto err_w_mmap;
	}

#if 0
	fprintf(stderr, "registered ireq memhndl 0x%016lx 0x%016lx\n",
		nic->irq_mem_hndl.qword1,
		nic->irq_mem_hndl.qword2);
#endif

	return ret;

err_w_mmap:
	munmap(mmap_addr, len);
err:
	return ret;
}

/*
 * release resources previously set up for remote
 * GNI_PostCqWrite's to target
 */
static int __nic_teardown_irq_cq(struct gnix_nic *nic)
{
	int ret = FI_SUCCESS;
	gni_return_t status;

	if (nic == NULL)
		return ret;

	if (nic->irq_mmap_addr == NULL)
		return ret;

	if ((nic->irq_mem_hndl.qword1) ||
	    (nic->irq_mem_hndl.qword2)) {
		status = GNI_MemDeregister(nic->gni_nic_hndl,
					   &nic->irq_mem_hndl);
		if (status != GNI_RC_SUCCESS) {
			ret = gnixu_to_fi_errno(status);
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_MemDeregister returned %s\n",
				  gni_err_str[status]);
		}
	}

	munmap(nic->irq_mmap_addr, nic->irq_mmap_len);
	return ret;
}

/*
 * placeholder for a better attributes checker
 */
static int __gnix_nic_check_attr_sanity(struct gnix_nic_attr *attr)
{
	return FI_SUCCESS;
}

static inline struct gnix_tx_descriptor *
__desc_lkup_by_id(struct gnix_nic *nic, int desc_id)
{
	struct gnix_tx_descriptor *tx_desc;

	assert((desc_id >= 0) && (desc_id <= nic->max_tx_desc_id));
	tx_desc = &nic->tx_desc_base[desc_id];
	return tx_desc;
}

static int __nic_rx_overrun(struct gnix_nic *nic)
{
	int i, max_id, ret;
	struct gnix_vc *vc;
	gni_return_t status;
	gni_cq_entry_t cqe;

	GNIX_WARN(FI_LOG_EP_DATA, "\n");

	/* clear out the CQ */
	/*
	 * TODO: really need to process CQEs better for error reporting,
	 * etc.
	 */
	while ((status = GNI_CqGetEvent(nic->rx_cq, &cqe)) == GNI_RC_SUCCESS);
	assert(status == GNI_RC_NOT_DONE);

	COND_ACQUIRE(nic->requires_lock, &nic->vc_id_lock);
	max_id = nic->vc_id_table_count;
	COND_RELEASE(nic->requires_lock, &nic->vc_id_lock);
	/*
	 * TODO: optimization would be to keep track of last time
	 * this happened and where smsg msgs. were found.
	 */
	for (i = 0; i < max_id; i++) {
		ret = _gnix_test_bit(&nic->vc_id_bitmap, i);
		if (ret) {
			vc = __gnix_nic_elem_by_rem_id(nic, i);
			ret = _gnix_vc_rx_schedule(vc);
			assert(ret == FI_SUCCESS);
		}
	}

	return FI_SUCCESS;
}

static int __process_rx_cqe(struct gnix_nic *nic, gni_cq_entry_t cqe)
{
	int ret = FI_SUCCESS, vc_id = 0;
	struct gnix_vc *vc;

	vc_id = GNI_CQ_GET_INST_ID(cqe);

	/*
	 * it's possible this vc has been destroyed, so may get NULL
	 * back.
	 */

	vc = __gnix_nic_elem_by_rem_id(nic, vc_id);
	if (vc != NULL) {
		switch (vc->conn_state) {
		case GNIX_VC_CONNECTING:
			GNIX_DEBUG(FI_LOG_EP_DATA,
				   "Scheduling VC for RX processing (%p)\n",
				   vc);
			ret = _gnix_vc_rx_schedule(vc);
			assert(ret == FI_SUCCESS);
			break;
		case GNIX_VC_CONNECTED:
			GNIX_DEBUG(FI_LOG_EP_DATA,
				   "Processing VC RX (%p)\n", vc);
			ret = _gnix_vc_rx_schedule(vc);
			assert(ret == FI_SUCCESS);
			break;
		default:
			/* VC not in a state for scheduling or
			 * SMSG processing */
			break;
		}
	}

	return ret;
}

static int __nic_rx_progress(struct gnix_nic *nic)
{
	int ret = FI_SUCCESS;
	gni_return_t status = GNI_RC_NOT_DONE;
	gni_cq_entry_t cqe;

	status = GNI_CqTestEvent(nic->rx_cq);
	if (status == GNI_RC_NOT_DONE)
		return FI_SUCCESS;

	COND_ACQUIRE(nic->requires_lock, &nic->lock);

	do {
		status = GNI_CqGetEvent(nic->rx_cq, &cqe);
		if (OFI_UNLIKELY(status == GNI_RC_NOT_DONE)) {
			ret = FI_SUCCESS;
			break;
		}

		if (OFI_LIKELY(status == GNI_RC_SUCCESS)) {
			/* Find and schedule the associated VC. */
			ret = __process_rx_cqe(nic, cqe);
			if (ret != FI_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_DATA,
					  "process_rx_cqe() failed: %d\n",
					  ret);
			}
		} else if (status == GNI_RC_ERROR_RESOURCE) {
			/* The remote CQ was overrun.  Events related to any
			 * VC could have been missed.  Schedule each VC to be
			 * sure all messages are processed. */
			assert(GNI_CQ_OVERRUN(cqe));
			__nic_rx_overrun(nic);
		} else {
			GNIX_WARN(FI_LOG_EP_DATA,
				  "GNI_CqGetEvent returned %s\n",
				  gni_err_str[status]);
			ret = gnixu_to_fi_errno(status);
			break;
		}
	} while (1);

	COND_RELEASE(nic->requires_lock, &nic->lock);

	return ret;
}

void _gnix_nic_txd_err_inject(struct gnix_nic *nic,
			      struct gnix_tx_descriptor *txd)
{
	slist_insert_tail(&txd->err_list, &nic->err_txds);
}

static int __gnix_nic_txd_err_get(struct gnix_nic *nic,
				  struct gnix_tx_descriptor **txd)
{
	struct slist_entry *list_entry;
	struct gnix_tx_descriptor *txd_p;

	list_entry = slist_remove_head(&nic->err_txds);
	if (list_entry) {
		txd_p = container_of(list_entry,
				     struct gnix_tx_descriptor,
				     err_list);
		*txd = txd_p;
		return 1;
	}

	return 0;
}
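/*
 * Illustrative sketch, not compiled into the provider: a TX path that
 * detects an unrecoverable software error on a descriptor can queue it
 * with _gnix_nic_txd_err_inject().  __nic_get_completed_txd() below drains
 * this error list before polling the hardware CQ, so the descriptor is
 * handed back with GNI_RC_TRANSACTION_ERROR and its completer delivers an
 * error completion to the application.  The nic/txd variables and the
 * failure test shown here are hypothetical.
 */
#if 0
	if (send_failed_unrecoverably) {
		/* txd will reappear via __nic_get_completed_txd() with
		 * tx_status == GNI_RC_TRANSACTION_ERROR */
		_gnix_nic_txd_err_inject(nic, txd);
	}
#endif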
static void __nic_get_completed_txd(struct gnix_nic *nic,
				    gni_cq_handle_t hw_cq,
				    struct gnix_tx_descriptor **txd,
				    gni_return_t *tx_status)
{
	gni_post_descriptor_t *gni_desc;
	struct gnix_tx_descriptor *txd_p = NULL;
	struct gnix_fab_req *req;
	gni_return_t status;
	int msg_id;
	gni_cq_entry_t cqe;
	uint32_t recov = 1;

	if (__gnix_nic_txd_err_get(nic, &txd_p)) {
		*txd = txd_p;
		*tx_status = GNI_RC_TRANSACTION_ERROR;
		return;
	}

	status = GNI_CqGetEvent(hw_cq, &cqe);
	if (status == GNI_RC_NOT_DONE) {
		*txd = NULL;
		*tx_status = GNI_RC_NOT_DONE;
		return;
	}

	assert(status == GNI_RC_SUCCESS ||
	       status == GNI_RC_TRANSACTION_ERROR);

	if (OFI_UNLIKELY(status == GNI_RC_TRANSACTION_ERROR)) {
		status = GNI_CqErrorRecoverable(cqe, &recov);
		if (status == GNI_RC_SUCCESS) {
			if (!recov) {
				char ebuf[512];

				GNI_CqErrorStr(cqe, ebuf, sizeof(ebuf));
				GNIX_WARN(FI_LOG_EP_DATA,
					  "CQ error status: %s\n", ebuf);
			}
		} else {
			GNIX_WARN(FI_LOG_EP_DATA,
				  "GNI_CqErrorRecoverable returned: %s\n",
				  gni_err_str[status]);
			recov = 0;	/* assume something bad has happened */
		}
	}

	if (GNI_CQ_GET_TYPE(cqe) == GNI_CQ_EVENT_TYPE_POST) {
		status = GNI_GetCompleted(hw_cq, cqe, &gni_desc);

		assert(status == GNI_RC_SUCCESS ||
		       status == GNI_RC_TRANSACTION_ERROR);

		txd_p = container_of(gni_desc,
				     struct gnix_tx_descriptor,
				     gni_desc);
	} else if (GNI_CQ_GET_TYPE(cqe) == GNI_CQ_EVENT_TYPE_SMSG) {
		msg_id = GNI_CQ_GET_MSG_ID(cqe);
		txd_p = __desc_lkup_by_id(nic, msg_id);
	}

	if (OFI_UNLIKELY(txd_p == NULL))
		GNIX_FATAL(FI_LOG_EP_DATA, "Unexpected CQE: 0x%lx", cqe);

	/*
	 * set retry count on the request to max to force
	 * delivering errored CQ event to application
	 */
	if (!recov) {
		status = GNI_RC_TRANSACTION_ERROR;
		req = txd_p->req;
		if (req)
			req->tx_failures = UINT_MAX;
	}

	*tx_status = status;
	*txd = txd_p;
}

static int __nic_tx_progress(struct gnix_nic *nic, gni_cq_handle_t cq)
{
	int ret = FI_SUCCESS;
	gni_return_t tx_status;
	struct gnix_tx_descriptor *txd;

	do {
		txd = NULL;

		COND_ACQUIRE(nic->requires_lock, &nic->lock);
		__nic_get_completed_txd(nic, cq, &txd, &tx_status);
		COND_RELEASE(nic->requires_lock, &nic->lock);

		if (txd && txd->completer_fn) {
			ret = txd->completer_fn(txd, tx_status);
			if (ret != FI_SUCCESS) {
				/*
				 * TODO: need to post error to CQ
				 */
				GNIX_WARN(FI_LOG_EP_DATA,
					  "TXD completer failed: %d", ret);
			}
		}

		if ((txd == NULL) || ret != FI_SUCCESS)
			break;
	} while (1);

	return ret;
}

int _gnix_nic_progress(void *arg)
{
	struct gnix_nic *nic = (struct gnix_nic *)arg;
	int ret = FI_SUCCESS;

	ret = __nic_tx_progress(nic, nic->tx_cq);
	if (OFI_UNLIKELY(ret != FI_SUCCESS))
		return ret;

	if (nic->tx_cq_blk && nic->tx_cq_blk != nic->tx_cq) {
		ret = __nic_tx_progress(nic, nic->tx_cq_blk);
		if (OFI_UNLIKELY(ret != FI_SUCCESS))
			return ret;
	}

	ret = __nic_rx_progress(nic);
	if (ret != FI_SUCCESS)
		return ret;

	ret = _gnix_vc_nic_progress(nic);
	if (ret != FI_SUCCESS)
		return ret;

	return ret;
}

int _gnix_nic_free_rem_id(struct gnix_nic *nic, int remote_id)
{
	assert(nic);

	if ((remote_id < 0) || (remote_id > nic->vc_id_table_count))
		return -FI_EINVAL;

	_gnix_clear_bit(&nic->vc_id_bitmap, remote_id);

	return FI_SUCCESS;
}

/*
 * this function is needed to allow for quick lookup of a vc based on
 * the contents of the GNI CQE coming off of the GNI RX CQ associated
 * with the GNI nic being used by this VC.  A bitmap is used to expedite
 * scanning of vc's in the case of a GNI CQ overrun.
 */
int _gnix_nic_get_rem_id(struct gnix_nic *nic, int *remote_id, void *entry)
{
	int ret = FI_SUCCESS;
	void **table_base;

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	/*
	 * TODO: really need to search bitmap for clear
	 * bit before resizing the table
	 */

	COND_ACQUIRE(nic->requires_lock, &nic->vc_id_lock);
	if (nic->vc_id_table_capacity == nic->vc_id_table_count) {
		table_base = realloc(nic->vc_id_table,
				     2 * nic->vc_id_table_capacity *
				     sizeof(void *));
		if (table_base == NULL) {
			ret = -FI_ENOMEM;
			goto err;
		}
		nic->vc_id_table_capacity *= 2;
		nic->vc_id_table = table_base;

		ret = _gnix_realloc_bitmap(&nic->vc_id_bitmap,
					   nic->vc_id_table_capacity);
		if (ret != FI_SUCCESS) {
			assert(ret == -FI_ENOMEM);
			goto err;
		}
	}

	nic->vc_id_table[nic->vc_id_table_count] = entry;
	*remote_id = nic->vc_id_table_count;

	/*
	 * set bit in the bitmap
	 */

	_gnix_set_bit(&nic->vc_id_bitmap, nic->vc_id_table_count);

	++(nic->vc_id_table_count);

err:
	COND_RELEASE(nic->requires_lock, &nic->vc_id_lock);
	return ret;
}
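/*
 * Illustrative sketch, not compiled into the provider: the expected
 * pairing of the two calls above.  A VC obtains its remote id when it is
 * set up and releases it at teardown, once incoming RX CQEs can no longer
 * reference it.  The vc variable and the name of its id field are
 * hypothetical here.
 */
#if 0
	int remote_id;

	ret = _gnix_nic_get_rem_id(nic, &remote_id, vc);
	if (ret == FI_SUCCESS)
		vc->vc_id = remote_id;	/* stashed so RX CQEs can be matched */

	/* ... later, at VC teardown ... */
	_gnix_nic_free_rem_id(nic, vc->vc_id);
#endif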
/*
 * allocate a free list of tx descs for a gnix_nic struct.
 */
static int __gnix_nic_tx_freelist_init(struct gnix_nic *nic, int n_descs)
{
	int i, ret = FI_SUCCESS;
	struct gnix_tx_descriptor *desc_base, *desc_ptr;

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	/*
	 * set up free list of tx descriptors.
	 */

	desc_base = calloc(n_descs, sizeof(struct gnix_tx_descriptor));
	if (desc_base == NULL) {
		ret = -FI_ENOMEM;
		goto err;
	}

	dlist_init(&nic->tx_desc_free_list);
	dlist_init(&nic->tx_desc_active_list);

	for (i = 0, desc_ptr = desc_base; i < n_descs; i++, desc_ptr++) {
		desc_ptr->id = i;
		dlist_insert_tail(&desc_ptr->list,
				  &nic->tx_desc_free_list);
	}

	nic->max_tx_desc_id = n_descs - 1;
	nic->tx_desc_base = desc_base;

	fastlock_init(&nic->tx_desc_lock);

	return ret;

err:
	return ret;
}

/*
 * clean up the tx descs free list
 */
static void __gnix_nic_tx_freelist_destroy(struct gnix_nic *nic)
{
	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	free(nic->tx_desc_base);
	fastlock_destroy(&nic->tx_desc_lock);
}

/*
 * free a gnix nic and associated resources if refcnt drops to 0
 */
static void __nic_destruct(void *obj)
{
	int ret = FI_SUCCESS;
	gni_return_t status = GNI_RC_SUCCESS;
	struct gnix_nic *nic = (struct gnix_nic *) obj;

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	/* Get us out of the progression tables; we are destroying the nic
	 * and we don't want the wait progression thread to progress us
	 * after our structures are destroyed. */

	pthread_mutex_lock(&gnix_nic_list_lock);

	dlist_remove(&nic->gnix_nic_list);
	--gnix_nics_per_ptag[nic->ptag];
	dlist_remove(&nic->ptag_nic_list);

	pthread_mutex_unlock(&gnix_nic_list_lock);

	__gnix_nic_tx_freelist_destroy(nic);

	/*
	 * free irq cq related resources
	 */

	ret = __nic_teardown_irq_cq(nic);
	if (ret != FI_SUCCESS)
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "__nic_teardown_irq_cq returned %s\n",
			  fi_strerror(-ret));

	/*
	 * kill off progress thread, if any
	 */

	if (nic->progress_thread) {
		ret = pthread_cancel(nic->progress_thread);
		if ((ret != 0) && (ret != ESRCH)) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "pthread_cancel returned %d\n", ret);
			goto err;
		}

		ret = pthread_join(nic->progress_thread, NULL);
		if ((ret != 0) && (ret != ESRCH)) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "pthread_join returned %d\n", ret);
			goto err;
		}

		GNIX_INFO(FI_LOG_EP_CTRL, "pthread_join returned %d\n", ret);
		nic->progress_thread = 0;
	}

	/* Must free mboxes first, because the MR has a pointer to the
	 * nic handles below */
	ret = _gnix_mbox_allocator_destroy(nic->mbox_hndl);
	if (ret != FI_SUCCESS)
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "_gnix_mbox_allocator_destroy returned %s\n",
			  fi_strerror(-ret));

	/*
	 * see comments in the nic constructor about why
	 * the following code section is currently stubbed out.
	 */
#if 0
	ret = _gnix_mbox_allocator_destroy(nic->s_rdma_buf_hndl);
	if (ret != FI_SUCCESS)
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "_gnix_mbox_allocator_destroy returned %s\n",
			  fi_strerror(-ret));

	ret = _gnix_mbox_allocator_destroy(nic->r_rdma_buf_hndl);
	if (ret != FI_SUCCESS)
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "_gnix_mbox_allocator_destroy returned %s\n",
			  fi_strerror(-ret));
#endif

	if (!nic->gni_cdm_hndl) {
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "No CDM attached to nic, nic=%p\n", nic);
	}

	assert(nic->gni_cdm_hndl != NULL);

	if (nic->rx_cq != NULL && nic->rx_cq != nic->rx_cq_blk) {
		status = GNI_CqDestroy(nic->rx_cq);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_CqDestroy returned %s\n",
				  gni_err_str[status]);
			ret = gnixu_to_fi_errno(status);
			goto err;
		}
	}

	if (nic->rx_cq_blk != NULL) {
		status = GNI_CqDestroy(nic->rx_cq_blk);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_CqDestroy returned %s\n",
				  gni_err_str[status]);
			ret = gnixu_to_fi_errno(status);
			goto err;
		}
	}

	if (nic->tx_cq != NULL && nic->tx_cq != nic->tx_cq_blk) {
		status = GNI_CqDestroy(nic->tx_cq);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_CqDestroy returned %s\n",
				  gni_err_str[status]);
			ret = gnixu_to_fi_errno(status);
			goto err;
		}
	}

	if (nic->tx_cq_blk != NULL) {
		status = GNI_CqDestroy(nic->tx_cq_blk);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_CqDestroy returned %s\n",
				  gni_err_str[status]);
			ret = gnixu_to_fi_errno(status);
			goto err;
		}
	}

	if (nic->allocd_gni_res & GNIX_NIC_CDM_ALLOCD) {
		status = GNI_CdmDestroy(nic->gni_cdm_hndl);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_CdmDestroy returned %s\n",
				  gni_err_str[status]);
			ret = gnixu_to_fi_errno(status);
			goto err;
		}
	}

	if (nic->vc_id_table != NULL) {
		free(nic->vc_id_table);
	} else {
		GNIX_WARN(FI_LOG_EP_CTRL, "vc_id_table was NULL\n");
	}

	/*
	 * destroy VC free list associated with this nic
	 */

	_gnix_fl_destroy(&nic->vc_freelist);

	/*
	 * remove the nic from the linked lists
	 * for the domain and the global nic list
	 */

err:
	_gnix_free_bitmap(&nic->vc_id_bitmap);

	free(nic);
}

int _gnix_nic_free(struct gnix_nic *nic)
{
	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	if (nic == NULL)
		return -FI_EINVAL;

	_gnix_ref_put(nic);

	return FI_SUCCESS;
}
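/*
 * Illustrative sketch, not compiled into the provider: a typical caller
 * (e.g. endpoint setup) builds a gnix_nic_attr carrying the authorization
 * key for its ptag/cookie and lets gnix_nic_alloc() below either reuse an
 * existing NIC for that ptag or create a new one; _gnix_nic_free() drops
 * the reference at teardown.  The ep variable and its members are
 * hypothetical.
 */
#if 0
	struct gnix_nic_attr nic_attr = {0};
	struct gnix_nic *nic = NULL;

	nic_attr.auth_key = ep->auth_key;	/* supplies ptag and cookie */
	nic_attr.must_alloc = false;		/* allow NIC reuse for this ptag */

	ret = gnix_nic_alloc(ep->domain, &nic_attr, &nic);
	if (ret != FI_SUCCESS)
		return ret;

	/* ... use nic ... */

	_gnix_nic_free(nic);			/* drops the reference */
#endif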
/*
 * allocate a gnix_nic struct using attributes of the domain
 */
int gnix_nic_alloc(struct gnix_fid_domain *domain,
		   struct gnix_nic_attr *attr,
		   struct gnix_nic **nic_ptr)
{
	int ret = FI_SUCCESS;
	struct gnix_nic *nic = NULL;
	uint32_t device_addr;
	gni_return_t status;
	uint32_t fake_cdm_id = GNIX_CREATE_CDM_ID;
	gni_smsg_attr_t smsg_mbox_attr;
	struct gnix_nic_attr *nic_attr = &default_attr;
	uint32_t num_corespec_cpus = 0;
	bool must_alloc_nic = false;
	bool free_list_inited = false;
	struct gnix_auth_key *auth_key;

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	*nic_ptr = NULL;
	nic_attr->gni_cdm_modes = gnix_cdm_modes;

	if (attr) {
		ret = __gnix_nic_check_attr_sanity(attr);
		if (ret != FI_SUCCESS)
			return ret;
		nic_attr = attr;
		must_alloc_nic = nic_attr->must_alloc;
	}

	auth_key = nic_attr->auth_key;

	/*
	 * If we've maxed out the number of nics for this domain/ptag,
	 * search the list of existing nics.  Take the gnix_nic_list_lock
	 * here since the gnix_nic_list will be manipulated whether or
	 * not we attach to an existing nic or create a new one.
	 *
	 * Should not matter much that this is a pretty fat critical section
	 * since endpoint setup for RDM type will typically occur near
	 * app startup, likely in a single threaded region, and for the
	 * case of MSG, where there will likely be many 100s of EPs, after
	 * a few initial slow times through this section when nics are created,
	 * max nic count for the ptag will be reached and only the first part
	 * of the critical section - iteration over existing nics - will be
	 * happening.
	 */

	pthread_mutex_lock(&gnix_nic_list_lock);

	/*
	 * we can reuse previously allocated nics as long as
	 * must_alloc is not specified in the nic_attr arg.
	 */

	if ((must_alloc_nic == false) &&
	    (gnix_nics_per_ptag[auth_key->ptag] >= gnix_max_nics_per_ptag)) {
		assert(!dlist_empty(&gnix_nic_list_ptag[auth_key->ptag]));

		nic = dlist_first_entry(&gnix_nic_list_ptag[auth_key->ptag],
					struct gnix_nic, ptag_nic_list);
		dlist_remove(&nic->ptag_nic_list);
		dlist_insert_tail(&nic->ptag_nic_list,
				  &gnix_nic_list_ptag[auth_key->ptag]);
		_gnix_ref_get(nic);

		GNIX_INFO(FI_LOG_EP_CTRL, "Reusing NIC:%p\n", nic);
	}

	/*
	 * no nic found, create a cdm and attach
	 */

	if (!nic) {

		nic = calloc(1, sizeof(struct gnix_nic));
		if (nic == NULL) {
			ret = -FI_ENOMEM;
			goto err;
		}

		nic->using_vmdh = domain->using_vmdh;

		if (nic_attr->use_cdm_id == false) {
			ret = _gnix_cm_nic_create_cdm_id(domain, &fake_cdm_id);
			if (ret != FI_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "_gnix_cm_nic_create_cdm_id returned %s\n",
					  fi_strerror(-ret));
				goto err;
			}
		} else
			fake_cdm_id = nic_attr->cdm_id;

		if (nic_attr->gni_cdm_hndl == NULL) {
			status = GNI_CdmCreate(fake_cdm_id,
					       auth_key->ptag,
					       auth_key->cookie,
					       gnix_cdm_modes,
					       &nic->gni_cdm_hndl);
			if (status != GNI_RC_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "GNI_CdmCreate returned %s\n",
					  gni_err_str[status]);
				ret = gnixu_to_fi_errno(status);
				goto err1;
			}
			nic->allocd_gni_res |= GNIX_NIC_CDM_ALLOCD;
		} else {
			nic->gni_cdm_hndl = nic_attr->gni_cdm_hndl;
		}

		/*
		 * Okay, now go for the attach
		 */

		if (nic_attr->gni_nic_hndl == NULL) {
			status = GNI_CdmAttach(nic->gni_cdm_hndl,
					       0,
					       &device_addr,
					       &nic->gni_nic_hndl);
			if (status != GNI_RC_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "GNI_CdmAttach returned %s\n",
					  gni_err_str[status]);
				_gnix_dump_gni_res(auth_key->ptag);
				ret = gnixu_to_fi_errno(status);
				goto err1;
			}
		} else
			nic->gni_nic_hndl = nic_attr->gni_nic_hndl;

		/*
		 * create TX CQs - first blocking, then polling
		 */

		status = GNI_CqCreate(nic->gni_nic_hndl,
				      domain->params.tx_cq_size,
				      0,	/* no delay count */
				      GNI_CQ_BLOCKING | domain->gni_cq_modes,
				      NULL,	/* useless handler */
				      NULL,	/* useless handler context */
				      &nic->tx_cq_blk);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_CqCreate returned %s\n",
				  gni_err_str[status]);
			_gnix_dump_gni_res(auth_key->ptag);
			ret = gnixu_to_fi_errno(status);
			goto err1;
		}

		/* Use blocking CQs for all operations if eager_auto_progress
		 * is used.
		 */
		if (domain->params.eager_auto_progress) {
			nic->tx_cq = nic->tx_cq_blk;
		} else {
			status = GNI_CqCreate(nic->gni_nic_hndl,
					      domain->params.tx_cq_size,
					      0,	/* no delay count */
					      domain->gni_cq_modes,
					      NULL,	/* useless handler */
					      NULL,	/* useless handler ctx */
					      &nic->tx_cq);
			if (status != GNI_RC_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "GNI_CqCreate returned %s\n",
					  gni_err_str[status]);
				_gnix_dump_gni_res(auth_key->ptag);
				ret = gnixu_to_fi_errno(status);
				goto err1;
			}
		}

		/*
		 * create RX CQs - first blocking, then polling
		 */

		status = GNI_CqCreate(nic->gni_nic_hndl,
				      domain->params.rx_cq_size,
				      0,
				      GNI_CQ_BLOCKING | domain->gni_cq_modes,
				      NULL,
				      NULL,
				      &nic->rx_cq_blk);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_CqCreate returned %s\n",
				  gni_err_str[status]);
			_gnix_dump_gni_res(auth_key->ptag);
			ret = gnixu_to_fi_errno(status);
			goto err1;
		}

		/* Use blocking CQs for all operations if eager_auto_progress
		 * is used. */
		if (domain->params.eager_auto_progress) {
			nic->rx_cq = nic->rx_cq_blk;
		} else {
			status = GNI_CqCreate(nic->gni_nic_hndl,
					      domain->params.rx_cq_size,
					      0,
					      domain->gni_cq_modes,
					      NULL,
					      NULL,
					      &nic->rx_cq);
			if (status != GNI_RC_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "GNI_CqCreate returned %s\n",
					  gni_err_str[status]);
				_gnix_dump_gni_res(auth_key->ptag);
				ret = gnixu_to_fi_errno(status);
				goto err1;
			}
		}

		nic->device_addr = device_addr;
		nic->ptag = auth_key->ptag;
		nic->cookie = auth_key->cookie;

		nic->vc_id_table_capacity = domain->params.vc_id_table_capacity;
		nic->vc_id_table = malloc(sizeof(void *) *
					  nic->vc_id_table_capacity);
		if (nic->vc_id_table == NULL) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "malloc of vc_id_table failed\n");
			ret = -FI_ENOMEM;
			goto err1;
		}

		ret = _gnix_alloc_bitmap(&nic->vc_id_bitmap,
					 nic->vc_id_table_capacity, NULL);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "alloc_bitmap returned %d\n", ret);
			goto err1;
		}
		fastlock_init(&nic->vc_id_lock);

		/*
		 * initialize free list for VCs.
		 * In addition to hopefully allowing for a more compact
		 * allocation of VC structs, the free list is also important
		 * because there is a window of time when using auto progress
		 * that a thread may be going through the progress engine
		 * while one of the application threads is actively tearing
		 * down an endpoint (and hence its associated VCs) before the
		 * rem_id for the vc is removed from the vector.
		 * As a consequence, it is important that
		 * the memory allocated within the freelist allocator not be
		 * returned to the system prior to the freelist being destroyed
		 * as part of the nic destructor procedure.  The freelist is
		 * destroyed in that procedure after the progress thread
		 * has been joined.
		 */
		ret = _gnix_fl_init_ts(sizeof(struct gnix_vc),
				       offsetof(struct gnix_vc, fr_list),
				       GNIX_VC_FL_MIN_SIZE,
				       GNIX_VC_FL_INIT_REFILL_SIZE,
				       0, 0, &nic->vc_freelist);
		if (ret == FI_SUCCESS) {
			free_list_inited = true;
		} else {
			GNIX_DEBUG(FI_LOG_EP_DATA,
				   "_gnix_fl_init_ts returned: %s\n",
				   fi_strerror(-ret));
			goto err1;
		}

		fastlock_init(&nic->lock);

		ret = __gnix_nic_tx_freelist_init(nic,
						  domain->params.tx_cq_size);
		if (ret != FI_SUCCESS)
			goto err1;

		fastlock_init(&nic->prog_vcs_lock);
		dlist_init(&nic->prog_vcs);

		_gnix_ref_init(&nic->ref_cnt, 1, __nic_destruct);

		smsg_mbox_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
		smsg_mbox_attr.mbox_maxcredit = domain->params.mbox_maxcredit;
		smsg_mbox_attr.msg_maxsize = domain->params.mbox_msg_maxsize;

		status = GNI_SmsgBufferSizeNeeded(&smsg_mbox_attr,
						  &nic->mem_per_mbox);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_SmsgBufferSizeNeeded returned %s\n",
				  gni_err_str[status]);
			ret = gnixu_to_fi_errno(status);
			goto err1;
		}

		/*
		 * set up mailbox allocator for SMSG mailboxes
		 */

		ret = _gnix_mbox_allocator_create(nic,
						  nic->rx_cq,
						  domain->params.mbox_page_size,
						  (size_t)nic->mem_per_mbox,
						  domain->params.mbox_num_per_slab,
						  &nic->mbox_hndl);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "_gnix_mbox_alloc returned %s\n",
				  fi_strerror(-ret));
			goto err1;
		}

		/*
		 * use the mailbox allocator system to set up
		 * pre-pinned RDMA bounce buffers for longer eager
		 * messages and other cases where zero-copy
		 * can't be safely used.
		 *
		 * One set of blocks is used for the send side.
		 * A second set of blocks is used for the receive
		 * side.  Both sets of blocks are registered against
		 * the blocking RX CQ for this nic.
		 *
		 * TODO: hardwired constants, uff
		 * TODO: better to use a buddy allocator or some other
		 * allocator
		 * Disable these for now as we're not using them and they
		 * chew up a lot of IOMMU space per nic.
		 */

#if 0
		ret = _gnix_mbox_allocator_create(nic,
						  NULL,
						  GNIX_PAGE_2MB,
						  65536,
						  512,
						  &nic->s_rdma_buf_hndl);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "_gnix_mbox_alloc returned %s\n",
				  fi_strerror(-ret));
			_gnix_dump_gni_res(domain->ptag);
			goto err1;
		}

		ret = _gnix_mbox_allocator_create(nic,
						  NULL,
						  GNIX_PAGE_2MB,
						  65536,
						  512,
						  &nic->r_rdma_buf_hndl);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "_gnix_mbox_alloc returned %s\n",
				  fi_strerror(-ret));
			_gnix_dump_gni_res(domain->ptag);
			goto err1;
		}
#endif

		ret = __nic_setup_irq_cq(nic);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "__nic_setup_irq_cq returned %s\n",
				  fi_strerror(-ret));
			_gnix_dump_gni_res(auth_key->ptag);
			goto err1;
		}

		/*
		 * if the domain is using PROGRESS_AUTO for data, set up
		 * a progress thread.
		 */

		if (domain->data_progress == FI_PROGRESS_AUTO) {

			/*
			 * tell CLE job container that next thread should be
			 * runnable anywhere in the cpuset, don't treat as
			 * an error if one is returned, may have perf issues
			 * though...
			 */
			ret = _gnix_get_num_corespec_cpus(&num_corespec_cpus);
			if (ret != FI_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "failed to get num corespec cpus\n");
			}
			if (num_corespec_cpus > 0) {
				ret = _gnix_job_disable_affinity_apply();
			} else {
				ret = _gnix_job_enable_unassigned_cpus();
			}
			if (ret != 0)
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "job_disable/unassigned cpus returned %d\n",
					  ret);

			ret = pthread_create(&nic->progress_thread,
					     NULL,
					     __gnix_nic_prog_thread_fn,
					     (void *)nic);
			if (ret)
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "pthread_create call returned %d\n",
					  ret);
		}

		dlist_insert_tail(&nic->gnix_nic_list, &gnix_nic_list);
		dlist_insert_tail(&nic->ptag_nic_list,
				  &gnix_nic_list_ptag[auth_key->ptag]);

		nic->smsg_callbacks = gnix_ep_smsg_callbacks;

		++gnix_nics_per_ptag[auth_key->ptag];

		GNIX_INFO(FI_LOG_EP_CTRL, "Allocated NIC:%p\n", nic);
	}

	if (nic) {
		nic->requires_lock = domain->thread_model != FI_THREAD_COMPLETION;
		nic->using_vmdh = domain->using_vmdh;
	}

	*nic_ptr = nic;
	goto out;

err1:
	ofi_atomic_dec32(&gnix_id_counter);
err:
	if (nic != NULL) {
		__nic_teardown_irq_cq(nic);
		if (nic->r_rdma_buf_hndl != NULL)
			_gnix_mbox_allocator_destroy(nic->r_rdma_buf_hndl);
		if (nic->s_rdma_buf_hndl != NULL)
			_gnix_mbox_allocator_destroy(nic->s_rdma_buf_hndl);
		if (nic->mbox_hndl != NULL)
			_gnix_mbox_allocator_destroy(nic->mbox_hndl);
		if (nic->rx_cq != NULL && nic->rx_cq != nic->rx_cq_blk)
			GNI_CqDestroy(nic->rx_cq);
		if (nic->rx_cq_blk != NULL)
			GNI_CqDestroy(nic->rx_cq_blk);
		if (nic->tx_cq != NULL && nic->tx_cq != nic->tx_cq_blk)
			GNI_CqDestroy(nic->tx_cq);
		if (nic->tx_cq_blk != NULL)
			GNI_CqDestroy(nic->tx_cq_blk);
		if ((nic->gni_cdm_hndl != NULL) &&
		    (nic->allocd_gni_res & GNIX_NIC_CDM_ALLOCD))
			GNI_CdmDestroy(nic->gni_cdm_hndl);
		if (free_list_inited == true)
			_gnix_fl_destroy(&nic->vc_freelist);
		free(nic);
	}

out:
	pthread_mutex_unlock(&gnix_nic_list_lock);
	return ret;
}

void _gnix_nic_init(void)
{
	int i, rc;

	for (i = 0; i < GNI_PTAG_MAX; i++) {
		dlist_init(&gnix_nic_list_ptag[i]);
	}

	rc = _gnix_nics_per_rank(&gnix_max_nics_per_ptag);
	if (rc == FI_SUCCESS) {
		GNIX_DEBUG(FI_LOG_FABRIC, "gnix_max_nics_per_ptag: %u\n",
			   gnix_max_nics_per_ptag);
	} else {
		GNIX_WARN(FI_LOG_FABRIC, "_gnix_nics_per_rank failed: %d\n",
			  rc);
	}

	if (getenv("GNIX_MAX_NICS") != NULL)
		gnix_max_nics_per_ptag = atoi(getenv("GNIX_MAX_NICS"));

	/*
	 * If we didn't get even one nic, that means we must really be doing
	 * FMA sharing.
	 */

	if (gnix_max_nics_per_ptag == 0) {
		gnix_max_nics_per_ptag = 1;
		GNIX_WARN(FI_LOG_FABRIC, "Using inter-process FMA sharing\n");
	}
}
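/*
 * Illustrative note on typical usage (an assumption, not provider code):
 * the per-ptag NIC limit computed above can be overridden at run time
 * through the environment, e.g.
 *
 *     GNIX_MAX_NICS=1 ./my_app
 *
 * which limits each ptag to a single GNI NIC that all endpoints sharing
 * that ptag reuse.
 */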