/* * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. * * LICENSE_BEGIN * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * LICENSE_END * * */ #include #include #include #include #include "libnl_utils.h" #include "usnic_user_utils.h" #if USNIC_LOG_LVL >= USNIC_LOG_LVL_ERR #define usnic_nlmsg_dump(msg) nl_msg_dump(msg, stderr) #else #define usnic_nlmsg_dump(msg) #endif /* * Querying the routing tables via netlink is expensive, especially * when many processes are doing so at the same time on a single * server (e.g., in an MPI job). As such, we cache netlink responses * to alleviate pressure on the netlink kernel interface. */ struct usd_nl_cache_entry { time_t timestamp; uint32_t src_ipaddr_be; uint32_t dest_ipaddr_be; int ifindex; uint32_t nh_addr; int reachable; /* For now, this cache is a simple linked list. Eventually, * this cache should be a better data structure, such as a * hash table. */ struct usd_nl_cache_entry *prev; struct usd_nl_cache_entry *next; }; /* Semi-arbitrarily set cache TTL to 2 minutes */ static time_t usd_nl_cache_timeout = 120; static struct usd_nl_cache_entry *cache = NULL; static struct nla_policy route_policy[RTA_MAX+1] = { [RTA_IIF] = { .type = NLA_STRING, .maxlen = IFNAMSIZ, }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_FLOW] = { .type = NLA_U32 }, [RTA_MP_ALGO] = { .type = NLA_U32 }, [RTA_CACHEINFO] = { .minlen = sizeof(struct rta_cacheinfo) }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .type = NLA_NESTED }, }; static int usnic_is_nlreply_expected(struct usnic_nl_sk *unlsk, struct nlmsghdr *nlm_hdr) { if (nlm_hdr->nlmsg_pid != nl_socket_get_local_port(unlsk->nlh) || nlm_hdr->nlmsg_seq != unlsk->seq) { usnic_err("Not an expected reply msg pid: %u local pid: %u msg seq: %u expected seq: %u\n", nlm_hdr->nlmsg_pid, nl_socket_get_local_port(unlsk->nlh), nlm_hdr->nlmsg_seq, unlsk->seq); return 0; } return 1; } static int usnic_is_nlreply_err(struct nlmsghdr *nlm_hdr, struct usnic_rt_cb_arg *arg) { if (nlm_hdr->nlmsg_type == NLMSG_ERROR) { struct nlmsgerr *e = (struct nlmsgerr *)nlmsg_data(nlm_hdr); if (nlm_hdr->nlmsg_len >= (__u32)NLMSG_SIZE(sizeof(*e))) { usnic_strerror(e->error, "Received a netlink error message"); /* Sometimes nl_send() succeeds, but the * request fails because the kernel is * temporarily out of resources. In these * cases, we should tell the caller that they * should try again. */ if (e->error == -ECONNREFUSED) { arg->retry = 1; } } else usnic_err( "Received a truncated netlink error message\n"); return 1; } return 0; } static int usnic_nl_send_query(struct usnic_nl_sk *unlsk, struct nl_msg *msg, int protocol, int flag) { int ret, retry; struct nlmsghdr *nlhdr; nlhdr = nlmsg_hdr(msg); while (1) { nlhdr->nlmsg_pid = nl_socket_get_local_port(unlsk->nlh); nlhdr->nlmsg_seq = ++unlsk->seq; nlmsg_set_proto(msg, protocol); nlhdr->nlmsg_flags = flag; /* Sometimes nl_send() can fail simply because the * kernel is temporarily out of resources, and we * should just try again. libnl1 and libnl3 handle * this case a little differently, so use the * USD_NL_SEND() macro to hide the differences. If * retry comes back as true, then sleep a little and * try again. */ USD_NL_SEND(unlsk->nlh, msg, ret, retry); if (retry) { usleep(5); continue; } break; } return ret; } static int usnic_nl_set_rcvsk_timer(NL_HANDLE *nlh) { int err = 0; struct timeval timeout; timeout.tv_sec = 1; timeout.tv_usec = 0; err = setsockopt(nl_socket_get_fd(nlh), SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout)); if (err < 0) usnic_perr("Failed to set SO_RCVTIMEO for nl socket"); return err; } static int usnic_nl_sk_alloc(struct usnic_nl_sk **p_sk, int protocol) { struct usnic_nl_sk *unlsk; NL_HANDLE *nlh; int err; unlsk = calloc(1, sizeof(*unlsk)); if (!unlsk) { usnic_err("Failed to allocate usnic_nl_sk struct\n"); return ENOMEM; } nlh = NL_HANDLE_ALLOC(); if (!nlh) { usnic_err("Failed to allocate nl handle\n"); err = ENOMEM; goto err_free_unlsk; } err = nl_connect(nlh, protocol); if (err < 0) { usnic_err("Failed to connnect netlink route socket error: %s\n", NL_GETERROR(err)); err = EINVAL; goto err_free_nlh; } NL_DISABLE_SEQ_CHECK(nlh); err = usnic_nl_set_rcvsk_timer(nlh); if (err < 0) goto err_close_nlh; unlsk->nlh = nlh; unlsk->seq = time(NULL); *p_sk = unlsk; return 0; err_close_nlh: nl_close(nlh); err_free_nlh: NL_HANDLE_FREE(nlh); err_free_unlsk: free(unlsk); return err; } static void usnic_nl_sk_free(struct usnic_nl_sk *unlsk) { nl_close(unlsk->nlh); NL_HANDLE_FREE(unlsk->nlh); free(unlsk); } static int usnic_rt_raw_parse_cb(struct nl_msg *msg, void *arg) { struct usnic_rt_cb_arg *lookup_arg = (struct usnic_rt_cb_arg *)arg; struct usnic_nl_sk *unlsk = lookup_arg->unlsk; struct nlmsghdr *nlm_hdr = nlmsg_hdr(msg); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX + 1]; int found = 0; int err; INC_CB_MSGCNT(lookup_arg); if (!usnic_is_nlreply_expected(unlsk, nlm_hdr)) { usnic_nlmsg_dump(msg); return NL_SKIP; } if (usnic_is_nlreply_err(nlm_hdr, lookup_arg)) { usnic_nlmsg_dump(msg); return NL_SKIP; } if (nlm_hdr->nlmsg_type != RTM_NEWROUTE) { char buf[128]; nl_nlmsgtype2str(nlm_hdr->nlmsg_type, buf, sizeof(buf)); usnic_err("Received an invalid route request reply message type: %s\n", buf); usnic_nlmsg_dump(msg); return NL_SKIP; } rtm = nlmsg_data(nlm_hdr); if (rtm->rtm_family != AF_INET) { usnic_err("RTM message contains invalid AF family: %u\n", rtm->rtm_family); usnic_nlmsg_dump(msg); return NL_SKIP; } err = nlmsg_parse(nlm_hdr, sizeof(struct rtmsg), tb, RTA_MAX, route_policy); if (err < 0) { usnic_err("nlmsg parse error %s\n", NL_GETERROR(err)); usnic_nlmsg_dump(msg); return NL_SKIP; } if (tb[RTA_OIF]) { if (nla_get_u32(tb[RTA_OIF]) == (uint32_t)lookup_arg->oif) found = 1; else usnic_err("Retrieved route has a different outgoing interface %d (expected %d)\n", nla_get_u32(tb[RTA_OIF]), lookup_arg->oif); } if (found && tb[RTA_GATEWAY]) lookup_arg->nh_addr = nla_get_u32(tb[RTA_GATEWAY]); lookup_arg->found = found; return NL_STOP; } static struct usd_nl_cache_entry * usd_nl_cache_lookup(uint32_t src_ipaddr_be, uint32_t dest_ipaddr_be, int ifindex) { time_t now; struct usd_nl_cache_entry *nlce; struct usd_nl_cache_entry *stale; now = time(NULL); for (nlce = cache; NULL != nlce; ) { /* While we're traversing the cache, we might as well * remove stale entries */ if (now > nlce->timestamp + usd_nl_cache_timeout) { stale = nlce; nlce = nlce->next; if (stale->prev) { stale->prev->next = stale->next; } if (stale->next) { stale->next->prev = stale->prev; } if (cache == stale) { cache = nlce; } free(stale); continue; } if (nlce->src_ipaddr_be == src_ipaddr_be && nlce->dest_ipaddr_be == dest_ipaddr_be && nlce->ifindex == ifindex) { return nlce; } nlce = nlce->next; } return NULL; } static void usd_nl_cache_save(int32_t src_ipaddr_be, uint32_t dest_ipaddr_be, int ifindex, uint32_t nh_addr, int reachable) { struct usd_nl_cache_entry *nlce; nlce = calloc(1, sizeof(*nlce)); if (NULL == nlce) { return; } nlce->timestamp = time(NULL); nlce->src_ipaddr_be = src_ipaddr_be; nlce->dest_ipaddr_be = dest_ipaddr_be; nlce->ifindex = ifindex; nlce->nh_addr = nh_addr; nlce->reachable = reachable; nlce->next = cache; if (cache) { cache->prev = nlce; } cache = nlce; } int usnic_nl_rt_lookup(uint32_t src_addr, uint32_t dst_addr, int oif, uint32_t *nh_addr) { struct usnic_nl_sk *unlsk; struct nl_msg *nlm; struct rtmsg rmsg; struct usnic_rt_cb_arg arg; int err; /* See if we have this NL result cached */ struct usd_nl_cache_entry *nlce; nlce = usd_nl_cache_lookup(src_addr, dst_addr, oif); if (nlce) { if (nlce->reachable) { *nh_addr = nlce->nh_addr; return 0; } else { return EHOSTUNREACH; } } retry: unlsk = NULL; err = usnic_nl_sk_alloc(&unlsk, NETLINK_ROUTE); if (err) return err; memset(&rmsg, 0, sizeof(rmsg)); rmsg.rtm_family = AF_INET; rmsg.rtm_dst_len = sizeof(dst_addr) * CHAR_BIT; rmsg.rtm_src_len = sizeof(src_addr) * CHAR_BIT; nlm = nlmsg_alloc_simple(RTM_GETROUTE, 0); if (!nlm) { usnic_err("Failed to alloc nl message, %s\n", NL_GETERROR(err)); err = ENOMEM; goto out; } nlmsg_append(nlm, &rmsg, sizeof(rmsg), NLMSG_ALIGNTO); nla_put_u32(nlm, RTA_DST, dst_addr); nla_put_u32(nlm, RTA_SRC, src_addr); err = usnic_nl_send_query(unlsk, nlm, NETLINK_ROUTE, NLM_F_REQUEST); nlmsg_free(nlm); if (err < 0) { usnic_err("Failed to send RTM_GETROUTE query message, error %s\n", NL_GETERROR(err)); err = EINVAL; goto out; } memset(&arg, 0, sizeof(arg)); arg.oif = oif; arg.unlsk = unlsk; err = nl_socket_modify_cb(unlsk->nlh, NL_CB_MSG_IN, NL_CB_CUSTOM, usnic_rt_raw_parse_cb, &arg); if (err != 0) { usnic_err("Failed to setup callback function, error %s\n", NL_GETERROR(err)); err = EINVAL; goto out; } /* Sometimes the recvmsg can fail because something is * temporarily out of resources. In this case, delay a little * and try again. */ do { err = 0; NL_RECVMSGS(unlsk->nlh, arg, EAGAIN, err, out); if (err == EAGAIN) { usleep(5); } } while (err == EAGAIN); /* If we got a reply back that indicated that the kernel was * too busy to handle this request, delay a little and try * again. */ if (arg.retry) { usleep(5); goto retry; } if (arg.found) { *nh_addr = arg.nh_addr; err = 0; } else { err = EHOSTUNREACH; } /* Save this result in the cache */ usd_nl_cache_save(src_addr, dst_addr, oif, arg.nh_addr, arg.found); out: usnic_nl_sk_free(unlsk); return err; }