/*
 * Copyright (c) 2013-2017 Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "psmx.h"
#include "ofi_prov.h"
#include <glob.h>

/* Incremented each time PROVIDER_INI runs and decremented by psmx_fini(). */
static int psmx_init_count = 0;
/* Set to 1 once psm_init() has succeeded; cleared after psm_finalize(). */
static int psmx_lib_initialized = 0;
static pthread_mutex_t psmx_lib_mutex; /* serializes psmx_init_lib(); initialized in PROVIDER_INI */
/* Set to 1 by psmx_init_lib() when the PSM library reports a major version > 1. */
static int psmx_compat_lib = 0;

/*
 * Provider-wide settings and their built-in defaults. psmx_init_env()
 * overrides these from the provider parameters registered in PROVIDER_INI.
 */
struct psmx_env psmx_env = {
	.name_server	= 1,	/* name server on; psmx_init_env() turns it off when a launcher rank variable is set */
	.am_msg		= 0,	/* active message based messaging off */
	.tagged_rma	= 1,	/* use tagged messages for large size RMA */
	.uuid		= PSMX_DEFAULT_UUID,	/* job ID required by the fabric */
	.delay		= 1,	/* seconds to wait before finalization */
	.timeout	= 5,	/* seconds allowed for gracefully closing the PSM endpoint */
	.prog_thread	= 1,	/* creation of the progress thread is allowed */
	.prog_interval	= -1,	/* microseconds; -1 presumably means "not set" so the documented default applies -- TODO confirm */
	.prog_affinity	= NULL,	/* progress thread affinity not set */
};

/*
 * Load the provider settings into psmx_env.
 *
 * The name server default is switched off first when one of the launcher
 * rank variables (OMPI_COMM_WORLD_RANK, PMI_RANK, PMIX_RANK) is present in
 * the environment; an explicit "name_server" parameter can still override
 * that. Every parameter is then fetched with the getter that matches the
 * type it is registered with in PROVIDER_INI. The getters' return values
 * are deliberately ignored: the defaults in psmx_env are the fallback.
 */
static void psmx_init_env(void)
{
	if (getenv("OMPI_COMM_WORLD_RANK") || getenv("PMI_RANK") || getenv("PMIX_RANK"))
		psmx_env.name_server = 0;

	/* Boolean parameters (registered as FI_PARAM_BOOL). */
	fi_param_get_bool(&psmx_prov, "name_server", &psmx_env.name_server);
	fi_param_get_bool(&psmx_prov, "am_msg", &psmx_env.am_msg);
	fi_param_get_bool(&psmx_prov, "tagged_rma", &psmx_env.tagged_rma);
	/* "prog_thread" is registered as a boolean, so read it as one. */
	fi_param_get_bool(&psmx_prov, "prog_thread", &psmx_env.prog_thread);

	/* Integer parameters (registered as FI_PARAM_INT). */
	fi_param_get_int(&psmx_prov, "delay", &psmx_env.delay);
	fi_param_get_int(&psmx_prov, "timeout", &psmx_env.timeout);
	fi_param_get_int(&psmx_prov, "prog_interval", &psmx_env.prog_interval);

	/* String parameters (registered as FI_PARAM_STRING). */
	fi_param_get_str(&psmx_prov, "uuid", &psmx_env.uuid);
	fi_param_get_str(&psmx_prov, "prog_affinity", &psmx_env.prog_affinity);
}

/*
 * Initialize the PSM library once per process.
 *
 * The initialized flag is tested before taking psmx_lib_mutex and again
 * while holding it, so only one caller performs the initialization.
 *
 * On success this records whether the library major version differs from
 * the header (psmx_am_compat_mode) and whether it is greater than 1
 * (psmx_compat_lib).
 *
 * Returns 0 when the library is (or already was) initialized, otherwise
 * the error code returned by psm_init().
 */
static int psmx_init_lib(void)
{
	int lib_major = PSM_VERNO_MAJOR;
	int lib_minor = PSM_VERNO_MINOR;
	int status = 0;
	int psm_err;

	if (psmx_lib_initialized)
		return 0;

	pthread_mutex_lock(&psmx_lib_mutex);

	if (!psmx_lib_initialized) {
		psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER);

		/* psm_init() takes the header version in and hands the library version back. */
		psm_err = psm_init(&lib_major, &lib_minor);
		if (psm_err != PSM_OK) {
			FI_WARN(&psmx_prov, FI_LOG_CORE,
				"psm_init failed: %s\n", psm_error_get_string(psm_err));
			status = psm_err;
		} else {
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"PSM header version = (%d, %d)\n", PSM_VERNO_MAJOR, PSM_VERNO_MINOR);
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"PSM library version = (%d, %d)\n", lib_major, lib_minor);

			if (lib_major != PSM_VERNO_MAJOR) {
				psmx_am_compat_mode = 1;
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"PSM AM compat mode enabled: appliation %d.%d, library %d.%d.\n",
					PSM_VERNO_MAJOR, PSM_VERNO_MINOR, lib_major, lib_minor);
			}

			if (lib_major > 1) {
				psmx_compat_lib = 1;
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"PSM is supported via the psm2-compat library over PSM2.\n");
			}

			psmx_lib_initialized = 1;
		}
	}

	pthread_mutex_unlock(&psmx_lib_mutex);
	return status;
}

/*
 * Work out which top tag bits the provider keeps for itself and shrink
 * the usable tag range accordingly.
 *
 * caps:          in: requested capabilities, 0 meaning "use PSMX_CAPS";
 *                out: the capabilities that can be offered.
 * max_tag_value: in: largest tag value the caller wants to use;
 *                out: complement of the reserved bit mask.
 *
 * A bit is reserved for FI_MSG (unless AM based messaging is selected) and
 * for FI_RMA (when tagged RMA is selected) if the requested tag range
 * leaves it free. When it does not, an explicit request fails, while the
 * default capability set merely drops the affected capability.
 *
 * Returns 0 on success, -1 when an explicitly requested capability cannot
 * be supported; the outputs are left untouched in that case.
 */
static int psmx_reserve_tag_bits(int *caps, uint64_t *max_tag_value)
{
	const int requested = *caps;
	int granted = requested ? requested : PSMX_CAPS;
	uint64_t mask = 0;

	if ((granted & FI_MSG) && !psmx_env.am_msg) {
		if (*max_tag_value < PSMX_MSG_BIT) {
			mask |= PSMX_MSG_BIT;
		} else {
			if (requested) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"unable to reserve tag bit for FI_MSG support.\n"
					"ADVICE: please reduce the asked max_tag_value, "
					"or remove FI_MSG from the asked capabilities, "
					"or set FI_PSM_AM_MSG=1 to use an alternative (but "
					"less optimized) message queue implementation.\n");
				return -1;
			}

			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"unable to reserve tag bit for FI_MSG support. "
				"FI_MSG is removed from the capabilities.\n"
				"ADVICE: please reduce the asked max_tag_value, "
				"or set FI_PSM_AM_MSG=1 to use an alternative (but "
				"less optimized) message queue implementation.\n");
			granted &= ~FI_MSG;
		}
	}

	if ((granted & FI_RMA) && psmx_env.tagged_rma) {
		if (*max_tag_value < PSMX_RMA_BIT) {
			mask |= PSMX_RMA_BIT;
		} else {
			if (requested) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"unable to reserve tag bit for tagged RMA acceleration.\n"
					"ADVICE: please reduce the asked max_tag_value, or "
					"remove FI_RMA from the asked capabilities, or set "
					"FI_PSM_TAGGED_RMA=0 to disable RMA acceleration.\n");
				return -1;
			}

			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"unable to reserve tag bit for tagged RMA acceleration. "
				"FI_RMA is removed from the capabilities.\n"
				"ADVICE: please reduce the asked max_tag_value, or "
				"set FI_PSM_TAGGED_RMA=0 to disable RMA acceleration.\n");
			granted &= ~FI_RMA;
		}
	}

	/* Also reserve the bit directly above each reserved bit. */
	mask |= mask << 1;

	*caps = granted;
	*max_tag_value = ~mask;
	return 0;
}

/*
 * Log, at FI_INFO level, a requested value next to the supported one.
 *
 * description: text printed in front of both lines
 * requested:   value taken from the hints
 * supported:   value the provider accepts
 * type:        fi_tostr() data type used to render both values
 *
 * The address of `requested` and `supported` is taken, so both must be
 * lvalues; callers copy constants into a local variable first.
 */
#define PSMX_INFO_DIFF(description, requested, supported, type) \
	do { \
		FI_INFO(&psmx_prov, FI_LOG_CORE, "%s: requested=%s\n", \
			(description), fi_tostr(&(requested), (type))); \
		FI_INFO(&psmx_prov, FI_LOG_CORE, "%s: supported=%s\n", \
			(description), fi_tostr(&(supported), (type))); \
	} while (0)

/*
 * Provider getinfo entry point: check the hints against what the provider
 * supports and, on success, return a single fi_info describing it.
 *
 * version: API version requested by the caller; for 1.5 and later the
 *          default mr_mode is 0 instead of FI_MR_SCALABLE.
 * node, service:
 *          with FI_SOURCE they are parsed into the local unit, port and
 *          service of the source address; without FI_SOURCE a non-NULL
 *          node is resolved through the name server into the destination
 *          address.
 * flags:   only FI_SOURCE is examined here.
 * hints:   optional constraints; NULL accepts the provider defaults.
 * info:    set to NULL on entry and to the newly allocated fi_info on
 *          success.
 *
 * The source and destination addresses allocated here are handed over to
 * the returned fi_info on success and freed on the error path.
 *
 * Returns 0 on success, -FI_ENOMEM when the fi_info cannot be allocated
 * and -FI_ENODATA for every other failure (unsupported hints, PSM
 * initialization failure, no device, unresolved node, ...).
 */
static int psmx_getinfo(uint32_t version, const char *node, const char *service,
			uint64_t flags, const struct fi_info *hints,
			struct fi_info **info)
{
	struct fi_info *psmx_info;
	uint32_t cnt = 0;
	psm_epid_t *dest_addr = NULL;
	struct psmx_src_name *src_addr = NULL;
	int ep_type = FI_EP_RDM;
	int av_type = FI_AV_UNSPEC;
	uint64_t mode = FI_CONTEXT;
	enum fi_mr_mode mr_mode = FI_MR_SCALABLE;
	enum fi_threading threading = FI_THREAD_COMPLETION;
	enum fi_progress control_progress = FI_PROGRESS_MANUAL;
	enum fi_progress data_progress = FI_PROGRESS_MANUAL;
	/*
	 * NOTE(review): caps is an int (as required by
	 * psmx_reserve_tag_bits()) while the capability masks are handled as
	 * uint64_t elsewhere in this function -- confirm that no capability
	 * bit is lost when the default set is reported through it.
	 */
	int caps = 0;
	uint64_t max_tag_value = 0;
	int err = -FI_ENODATA;
	int svc0, svc = PSMX_ANY_SERVICE;

	FI_INFO(&psmx_prov, FI_LOG_CORE,"\n");

	*info = NULL;

	/* Perform some quick check first to avoid unnecessary operations */
	if (hints) {
		if (hints->fabric_attr && hints->fabric_attr->name &&
		    strcasecmp(hints->fabric_attr->name, PSMX_FABRIC_NAME)) {
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"hints->fabric_name=%s, supported=%s\n",
				hints->fabric_attr->name, PSMX_FABRIC_NAME);
			goto err_out;
		}

		if (hints->domain_attr && hints->domain_attr->name &&
		    strcasecmp(hints->domain_attr->name, PSMX_DOMAIN_NAME)) {
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"hints->domain_name=%s, supported=%s\n",
				hints->domain_attr->name, PSMX_DOMAIN_NAME);
			goto err_out;
		}

		if (hints->ep_attr) {
			switch (hints->ep_attr->type) {
			case FI_EP_UNSPEC:
			case FI_EP_DGRAM:
			case FI_EP_RDM:
				break;
			default:
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->ep_attr->type=%d, supported=%d,%d,%d.\n",
					hints->ep_attr->type, FI_EP_UNSPEC,
					FI_EP_DGRAM, FI_EP_RDM);
				goto err_out;
			}
		}

		/* The request must fit entirely within one of the two capability sets. */
		if ((hints->caps & PSMX_CAPS) != hints->caps &&
		    (hints->caps & PSMX_CAPS2) != hints->caps) {
			uint64_t psmx_caps = PSMX_CAPS;
			uint64_t psmx_caps2 = PSMX_CAPS2;
			PSMX_INFO_DIFF("hints->caps", hints->caps, psmx_caps,
				       FI_TYPE_CAPS);
			PSMX_INFO_DIFF("alternatively, hints->caps",
				       hints->caps, psmx_caps2, FI_TYPE_CAPS);
			goto err_out;
		}
	}

	if (FI_VERSION_GE(version, FI_VERSION(1,5)))
		mr_mode = 0;

	if (psmx_init_lib())
		return -FI_ENODATA;

	if (psmx_compat_lib) {
		/*
		 * native PSM running over TrueScale doesn't have the issue handled
		 * here. it's only present when PSM is supported via the psm2-compat
		 * library, where the PSM functions are just wrappers around the PSM2
		 * counterparts.
		 *
		 * psm2_ep_num_devunits() may wait for 15 seconds before return
		 * when /dev/hfi1_0 is not present. Check the existence of any hfi1
		 * device interface first to avoid this delay. Note that the devices
		 * don't necessarily appear consecutively so we need to check all
		 * possible device names before returning "no device found" error.
		 * This also means if "/dev/hfi1_0" doesn't exist but other devices
		 * exist, we are still going to see the delay; but that's a rare case.
		 */
		glob_t glob_buf;
		int no_device;

		no_device = (glob("/dev/hfi1_[0-9]", 0, NULL, &glob_buf) != 0) &&
			    (glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &glob_buf) != 0);

		/* Only the match status is needed; release the result on every path. */
		globfree(&glob_buf);

		if (no_device) {
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"no hfi1 device is found.\n");
			return -FI_ENODATA;
		}
	}

	if (psm_ep_num_devunits(&cnt) || !cnt) {
		FI_INFO(&psmx_prov, FI_LOG_CORE,
			"no PSM device is found.\n");
		return -FI_ENODATA;
	}

	src_addr = calloc(1, sizeof(*src_addr));
	if (!src_addr) {
		FI_INFO(&psmx_prov, FI_LOG_CORE,
			"failed to allocate src addr.\n");
		return -FI_ENODATA;
	}
	src_addr->signature = 0xFFFF;
	src_addr->unit = PSMX_DEFAULT_UNIT;
	src_addr->port = PSMX_DEFAULT_PORT;
	src_addr->service = PSMX_ANY_SERVICE;

	if (flags & FI_SOURCE) {
		/* Best-effort parse: fields that are not matched keep the defaults set above. */
		if (node)
			sscanf(node, "%*[^:]:%" SCNi8 ":%" SCNu8, &src_addr->unit, &src_addr->port);
		if (service)
			sscanf(service, "%" SCNu32, &src_addr->service);
		/* node and service may be NULL here; never hand NULL to %s. */
		FI_INFO(&psmx_prov, FI_LOG_CORE,
			"node '%s' service '%s' converted to <unit=%d, port=%d, service=%d>\n",
			node ? node : "(null)", service ? service : "(null)",
			src_addr->unit, src_addr->port, src_addr->service);
	} else if (node) {
		psm_uuid_t uuid;
		psmx_get_uuid(uuid);

		struct util_ns ns = {
			.port = psmx_uuid_to_port(uuid),
			.name_len = sizeof(*dest_addr),
			.service_len = sizeof(svc),
			.service_cmp = psmx_ns_service_cmp,
			.is_service_wildcard = psmx_ns_is_service_wildcard,
		};
		ofi_ns_init(&ns);

		if (service)
			svc = atoi(service);
		/* Keep the requested service for the log; the resolver may update svc. */
		svc0 = svc;
		dest_addr = (psm_epid_t *)ofi_ns_resolve_name(&ns, node, &svc);
		if (dest_addr) {
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"'%s:%u' resolved to <epid=%"PRIu64">:%u\n",
				node, svc0, *dest_addr, svc);
		} else {
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"failed to resolve '%s:%u'.\n", node, svc);
			err = -FI_ENODATA;
			goto err_out;
		}
	}

	if (hints) {
		switch (hints->addr_format) {
		case FI_FORMAT_UNSPEC:
		case FI_ADDR_PSMX:
			break;
		default:
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"hints->addr_format=%d, supported=%d,%d.\n",
				hints->addr_format, FI_FORMAT_UNSPEC, FI_ADDR_PSMX);
			goto err_out;
		}

		if (hints->ep_attr) {
			switch (hints->ep_attr->protocol) {
			case FI_PROTO_UNSPEC:
			case FI_PROTO_PSMX:
				break;
			default:
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->protocol=%d, supported=%d %d\n",
					hints->ep_attr->protocol,
					FI_PROTO_UNSPEC, FI_PROTO_PSMX);
				goto err_out;
			}

			if (hints->ep_attr->tx_ctx_cnt > 1 &&
			    hints->ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->ep_attr->tx_ctx_cnt=%zu, supported=0,1\n",
					hints->ep_attr->tx_ctx_cnt);
				goto err_out;
			}

			if (hints->ep_attr->rx_ctx_cnt > 1) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->ep_attr->rx_ctx_cnt=%zu, supported=0,1\n",
					hints->ep_attr->rx_ctx_cnt);
				goto err_out;
			}
		}

		if (hints->tx_attr) {
			if ((hints->tx_attr->op_flags & PSMX_OP_FLAGS) !=
			    hints->tx_attr->op_flags) {
				uint64_t psmx_op_flags = PSMX_OP_FLAGS;
				PSMX_INFO_DIFF("hints->tx_attr->op_flags",
					       hints->tx_attr->op_flags,
					       psmx_op_flags, FI_TYPE_OP_FLAGS);
				goto err_out;
			}
			if (hints->tx_attr->inject_size > PSMX_INJECT_SIZE) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->tx_attr->inject_size=%zu,"
					"supported=%d.\n",
					hints->tx_attr->inject_size,
					PSMX_INJECT_SIZE);
				goto err_out;
			}
		}

		if (hints->rx_attr &&
		    (hints->rx_attr->op_flags & PSMX_OP_FLAGS) !=
		     hints->rx_attr->op_flags) {
			uint64_t psmx_op_flags = PSMX_OP_FLAGS;
			PSMX_INFO_DIFF("hints->rx_attr->op_flags",
				       hints->rx_attr->op_flags,
				       psmx_op_flags, FI_TYPE_OP_FLAGS);
			goto err_out;
		}

		/*
		 * FI_CONTEXT mode is mandatory for tagged messaging and for
		 * FI_MSG when it is not AM based; otherwise no mode bit is
		 * reported.
		 */
		if ((hints->caps & FI_TAGGED) ||
		    ((hints->caps & FI_MSG) && !psmx_env.am_msg)) {
			if ((hints->mode & FI_CONTEXT) != FI_CONTEXT) {
				uint64_t psmx_mode = FI_CONTEXT;
				PSMX_INFO_DIFF("hints->mode", hints->mode,
					       psmx_mode, FI_TYPE_MODE);
				goto err_out;
			}
		} else {
			mode = 0;
		}

		if (hints->domain_attr) {
			switch (hints->domain_attr->av_type) {
			case FI_AV_UNSPEC:
			case FI_AV_MAP:
			case FI_AV_TABLE:
				av_type = hints->domain_attr->av_type;
				break;
			default:
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->domain_attr->av_type=%d, supported=%d %d %d\n",
					hints->domain_attr->av_type, FI_AV_UNSPEC, FI_AV_MAP,
					FI_AV_TABLE);
				goto err_out;
			}

			/*
			 * FI_MR_BASIC and FI_MR_SCALABLE are only accepted on
			 * their own; any other value leaves the default mr_mode
			 * in place.
			 */
			if (hints->domain_attr->mr_mode == FI_MR_BASIC) {
				mr_mode = FI_MR_BASIC;
			} else if (hints->domain_attr->mr_mode == FI_MR_SCALABLE) {
				mr_mode = FI_MR_SCALABLE;
			} else if (hints->domain_attr->mr_mode & (FI_MR_BASIC | FI_MR_SCALABLE)) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->domain_attr->mr_mode has FI_MR_BASIC or FI_MR_SCALABLE "
					"combined with other bits\n");
				goto err_out;
			}

			switch (hints->domain_attr->threading) {
			case FI_THREAD_UNSPEC:
				break;
			case FI_THREAD_FID:
			case FI_THREAD_ENDPOINT:
			case FI_THREAD_COMPLETION:
			case FI_THREAD_DOMAIN:
				threading = hints->domain_attr->threading;
				break;
			default:
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->domain_attr->threading=%d, supported=%d %d %d %d %d\n",
					hints->domain_attr->threading, FI_THREAD_UNSPEC,
					FI_THREAD_FID, FI_THREAD_ENDPOINT, FI_THREAD_COMPLETION,
					FI_THREAD_DOMAIN);
				goto err_out;
			}

			switch (hints->domain_attr->control_progress) {
			case FI_PROGRESS_UNSPEC:
				break;
			case FI_PROGRESS_MANUAL:
			case FI_PROGRESS_AUTO:
				control_progress = hints->domain_attr->control_progress;
				break;
			default:
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->domain_attr->control_progress=%d, supported=%d %d %d\n",
					hints->domain_attr->control_progress, FI_PROGRESS_UNSPEC,
					FI_PROGRESS_MANUAL, FI_PROGRESS_AUTO);
				goto err_out;
			}

			switch (hints->domain_attr->data_progress) {
			case FI_PROGRESS_UNSPEC:
				break;
			case FI_PROGRESS_MANUAL:
			case FI_PROGRESS_AUTO:
				data_progress = hints->domain_attr->data_progress;
				break;
			default:
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->domain_attr->data_progress=%d, supported=%d %d %d\n",
					hints->domain_attr->data_progress, FI_PROGRESS_UNSPEC,
					FI_PROGRESS_MANUAL, FI_PROGRESS_AUTO);
				goto err_out;
			}

			if (hints->domain_attr->caps & FI_SHARED_AV) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->domain_attr->caps=%"PRIx64", shared AV is unsupported\n",
					hints->domain_attr->caps);
				goto err_out;
			}
		}

		if (hints->ep_attr) {
			if (hints->ep_attr->max_msg_size > PSMX_MAX_MSG_SIZE) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->ep_attr->max_msg_size=%zu,"
					"supported=%llu.\n",
					hints->ep_attr->max_msg_size,
					PSMX_MAX_MSG_SIZE);
				goto err_out;
			}
			max_tag_value = ofi_max_tag(hints->ep_attr->mem_tag_format);
		}

		/* inject_size was already validated together with op_flags above. */
		if (hints->tx_attr) {
			if ((hints->tx_attr->msg_order & PSMX_MSG_ORDER) !=
			    hints->tx_attr->msg_order) {
				uint64_t psmx_msg_order = PSMX_MSG_ORDER;
				PSMX_INFO_DIFF("hints->tx_attr->msg_order",
					       hints->tx_attr->msg_order,
					       psmx_msg_order, FI_TYPE_MSG_ORDER);
				goto err_out;
			}
			if ((hints->tx_attr->comp_order & PSMX_COMP_ORDER) !=
			    hints->tx_attr->comp_order) {
				uint64_t psmx_comp_order = PSMX_COMP_ORDER;
				PSMX_INFO_DIFF("hints->tx_attr->comp_order",
					       hints->tx_attr->comp_order,
					       psmx_comp_order, FI_TYPE_MSG_ORDER);
				goto err_out;
			}
			if (hints->tx_attr->iov_limit > 1) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->tx_attr->iov_limit=%zu,"
					"supported=1.\n",
					hints->tx_attr->iov_limit);
				goto err_out;
			}
			if (hints->tx_attr->rma_iov_limit > 1) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->tx_attr->rma_iov_limit=%zu,"
					"supported=1.\n",
					hints->tx_attr->rma_iov_limit);
				goto err_out;
			}
		}

		if (hints->rx_attr) {
			if ((hints->rx_attr->msg_order & PSMX_MSG_ORDER) !=
			    hints->rx_attr->msg_order) {
				uint64_t psmx_msg_order = PSMX_MSG_ORDER;
				PSMX_INFO_DIFF("hints->rx_attr->msg_order",
					       hints->rx_attr->msg_order,
					       psmx_msg_order, FI_TYPE_MSG_ORDER);
				goto err_out;
			}
			if ((hints->rx_attr->comp_order & PSMX_COMP_ORDER) !=
			    hints->rx_attr->comp_order) {
				uint64_t psmx_comp_order = PSMX_COMP_ORDER;
				PSMX_INFO_DIFF("hints->rx_attr->comp_order",
					       hints->rx_attr->comp_order,
					       psmx_comp_order, FI_TYPE_MSG_ORDER);
				goto err_out;
			}
			if (hints->rx_attr->iov_limit > 1) {
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"hints->rx_attr->iov_limit=%zu,"
					"supported=1.\n",
					hints->rx_attr->iov_limit);
				goto err_out;
			}
		}

		caps = hints->caps;

		/* TODO: check other fields of hints */
	}

	/* May drop FI_MSG/FI_RMA from the default caps and narrows max_tag_value. */
	if (psmx_reserve_tag_bits(&caps, &max_tag_value) < 0)
		goto err_out;

	psmx_info = fi_allocinfo();
	if (!psmx_info) {
		err = -FI_ENOMEM;
		goto err_out;
	}

	psmx_info->ep_attr->type = ep_type;
	psmx_info->ep_attr->protocol = FI_PROTO_PSMX;
	psmx_info->ep_attr->protocol_version = PSM_VERNO;
	psmx_info->ep_attr->max_msg_size = PSMX_MAX_MSG_SIZE;
	psmx_info->ep_attr->max_order_raw_size = PSMX_RMA_ORDER_SIZE;
	psmx_info->ep_attr->max_order_war_size = PSMX_RMA_ORDER_SIZE;
	psmx_info->ep_attr->max_order_waw_size = PSMX_RMA_ORDER_SIZE;
	psmx_info->ep_attr->mem_tag_format = ofi_tag_format(max_tag_value);
	psmx_info->ep_attr->tx_ctx_cnt = 1;
	psmx_info->ep_attr->rx_ctx_cnt = 1;

	psmx_info->domain_attr->threading = threading;
	psmx_info->domain_attr->control_progress = control_progress;
	psmx_info->domain_attr->data_progress = data_progress;
	psmx_info->domain_attr->name = strdup(PSMX_DOMAIN_NAME);
	psmx_info->domain_attr->resource_mgmt = FI_RM_ENABLED;
	psmx_info->domain_attr->av_type = av_type;
	psmx_info->domain_attr->mr_mode = mr_mode;
	psmx_info->domain_attr->mr_key_size = sizeof(uint64_t);
	psmx_info->domain_attr->cq_data_size = 4;
	psmx_info->domain_attr->cq_cnt = 65535;
	psmx_info->domain_attr->ep_cnt = 65535;
	psmx_info->domain_attr->tx_ctx_cnt = 1;
	psmx_info->domain_attr->rx_ctx_cnt = 1;
	psmx_info->domain_attr->max_ep_tx_ctx = 1;
	psmx_info->domain_attr->max_ep_rx_ctx = 1;
	psmx_info->domain_attr->max_ep_stx_ctx = 65535;
	psmx_info->domain_attr->max_ep_srx_ctx = 0;
	psmx_info->domain_attr->cntr_cnt = 65535;
	psmx_info->domain_attr->mr_iov_limit = 65535;
	psmx_info->domain_attr->caps = PSMX_DOM_CAPS;
	psmx_info->domain_attr->mode = 0;
	psmx_info->domain_attr->mr_cnt = 65535;

	psmx_info->next = NULL;
	/* Echo back exactly what was asked for; fall back to the computed defaults. */
	psmx_info->caps = (hints && hints->caps) ? hints->caps : caps;
	psmx_info->mode = mode;
	psmx_info->addr_format = FI_ADDR_PSMX;
	/* Ownership of both address buffers moves to the fi_info from here on. */
	psmx_info->src_addr = src_addr;
	psmx_info->src_addrlen = sizeof(*src_addr);
	psmx_info->dest_addr = dest_addr;
	psmx_info->dest_addrlen = sizeof(*dest_addr);
	psmx_info->fabric_attr->name = strdup(PSMX_FABRIC_NAME);
	psmx_info->fabric_attr->prov_name = NULL;
	psmx_info->fabric_attr->prov_version = OFI_VERSION_DEF_PROV;

	psmx_info->tx_attr->caps = psmx_info->caps;
	psmx_info->tx_attr->mode = psmx_info->mode;
	psmx_info->tx_attr->op_flags = (hints && hints->tx_attr && hints->tx_attr->op_flags)
					? hints->tx_attr->op_flags : 0;
	psmx_info->tx_attr->msg_order = PSMX_MSG_ORDER;
	psmx_info->tx_attr->comp_order = PSMX_COMP_ORDER;
	psmx_info->tx_attr->inject_size = PSMX_INJECT_SIZE;
	psmx_info->tx_attr->size = UINT64_MAX;
	psmx_info->tx_attr->iov_limit = 1;
	psmx_info->tx_attr->rma_iov_limit = 1;

	psmx_info->rx_attr->caps = psmx_info->caps;
	psmx_info->rx_attr->mode = psmx_info->mode;
	psmx_info->rx_attr->op_flags = (hints && hints->rx_attr && hints->rx_attr->op_flags)
					? hints->rx_attr->op_flags : 0;
	psmx_info->rx_attr->msg_order = PSMX_MSG_ORDER;
	psmx_info->rx_attr->comp_order = PSMX_COMP_ORDER;
	psmx_info->rx_attr->total_buffered_recv = ~(0ULL); /* that's how PSM handles it internally! */
	psmx_info->rx_attr->size = UINT64_MAX;
	psmx_info->rx_attr->iov_limit = 1;

	*info = psmx_info;
	return 0;

err_out:
	/* Either pointer may still be NULL here; free(NULL) is a no-op. */
	free(dest_addr);
	free(src_addr);

	return err;
}

/*
 * Provider cleanup hook (installed as psmx_prov.cleanup).
 *
 * Drops one reference taken by PROVIDER_INI. When the last reference is
 * gone and the PSM library has been initialized, the library is finalized,
 * unless a fabric is still active.
 */
static void psmx_fini(void)
{
	FI_INFO(&psmx_prov, FI_LOG_CORE, "\n");

	psmx_init_count--;
	if (psmx_init_count != 0 || !psmx_lib_initialized)
		return;

	/*
	 * This runs from a library destructor, i.e. automatically when
	 * exit() is called. If the application was terminated with Ctrl-C
	 * it could be inside a PSM call, holding a lock that psm_finalize()
	 * tries to acquire, which would deadlock. So psm_finalize() is only
	 * called when PSM is guaranteed to be unused.
	 */
	if (psmx_active_fabric) {
		FI_INFO(&psmx_prov, FI_LOG_CORE,
			"psmx_active_fabric != NULL, skip psm_finalize\n");
		return;
	}

	psm_finalize();
	psmx_lib_initialized = 0;
}

/*
 * Provider descriptor returned by PROVIDER_INI. It is also the handle
 * passed to the logging and parameter calls throughout this file.
 */
struct fi_provider psmx_prov = {
	.name = PSMX_PROV_NAME,
	.version = OFI_VERSION_DEF_PROV,
	.fi_version = OFI_VERSION_LATEST,
	.getinfo = psmx_getinfo,	/* hint matching and fi_info construction */
	.fabric = psmx_fabric,		/* defined outside this file */
	.cleanup = psmx_fini		/* drops the reference taken in PROVIDER_INI */
};

/*
 * Provider entry point: register the provider parameters, load their
 * values into psmx_env, set up the library mutex, take one reference on
 * the provider and return its descriptor.
 */
PROVIDER_INI
{
	/* Parameters are registered in the order listed here. */
	static const struct {
		const char		*name;
		enum fi_param_type	type;
		const char		*help;
	} psmx_params[] = {
		{ "name_server", FI_PARAM_BOOL,
		  "Whether to turn on the name server or not "
		  "(default: yes)" },
		{ "am_msg", FI_PARAM_BOOL,
		  "Whether to use active message based messaging "
		  "or not (default: no)" },
		{ "tagged_rma", FI_PARAM_BOOL,
		  "Whether to use tagged messages for large size "
		  "RMA or not (default: yes)" },
		{ "uuid", FI_PARAM_STRING,
		  "Unique Job ID required by the fabric" },
		{ "delay", FI_PARAM_INT,
		  "Delay (seconds) before finalization (for debugging)" },
		{ "timeout", FI_PARAM_INT,
		  "Timeout (seconds) for gracefully closing the PSM endpoint" },
		{ "prog_thread", FI_PARAM_BOOL,
		  "Whether to allow the creation of progress thread or not "
		  "(default: yes)" },
		{ "prog_interval", FI_PARAM_INT,
		  "Interval (microseconds) between progress calls made in the "
		  "progress thread (default: 1 if affinity is set, 1000 if not)" },
		{ "prog_affinity", FI_PARAM_STRING,
		  "When set, specify the set of CPU cores to set the progress "
		  "thread affinity to. The format is "
		  "<start>[:<end>[:<stride>]][,<start>[:<end>[:<stride>]]]*, "
		  "where each triplet <start>:<end>:<stride> defines a block "
		  "of core_ids. Both <start> and <end> can be either the core_id "
		  "(when >=0) or core_id - num_cores (when <0). "
		  "(default: affinity not set)" },
	};
	size_t i;

	FI_INFO(&psmx_prov, FI_LOG_CORE, "\n");

	/* The help text goes through "%s" so it is never used as a format itself. */
	for (i = 0; i < sizeof(psmx_params) / sizeof(psmx_params[0]); i++)
		fi_param_define(&psmx_prov, psmx_params[i].name,
				psmx_params[i].type, "%s", psmx_params[i].help);

	psmx_init_env();

	pthread_mutex_init(&psmx_lib_mutex, NULL);
	psmx_init_count++;
	return (&psmx_prov);
}