// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"). You may // not use this file except in compliance with the License. A copy of the // License is located at // // http://aws.amazon.com/apache2.0/ // // or in the "license" file accompanying this file. This file is distributed // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either // express or implied. See the License for the specific language governing // permissions and limitations under the License. package main import ( "fmt" "net" "os" "time" "github.com/containernetworking/cni/pkg/types" "github.com/containernetworking/cni/pkg/types/current" "github.com/containernetworking/plugins/pkg/ns" "github.com/coreos/go-iptables/iptables" "github.com/vishvananda/netlink" "github.com/aws/amazon-vpc-cni-k8s/cmd/egress-cni-plugin/snat" "github.com/aws/amazon-vpc-cni-k8s/pkg/hostipamwrapper" "github.com/aws/amazon-vpc-cni-k8s/pkg/iptableswrapper" "github.com/aws/amazon-vpc-cni-k8s/pkg/netlinkwrapper" "github.com/aws/amazon-vpc-cni-k8s/pkg/nswrapper" "github.com/aws/amazon-vpc-cni-k8s/pkg/procsyswrapper" "github.com/aws/amazon-vpc-cni-k8s/pkg/utils/cniutils" "github.com/aws/amazon-vpc-cni-k8s/pkg/utils/logger" "github.com/aws/amazon-vpc-cni-k8s/pkg/vethwrapper" ) const ( ipv4MulticastRange = "224.0.0.0/4" ipv6MulticastRange = "ff00::/8" // WaitInterval Time duration CNI waits before next check for an IPv6 address assigned to an interface // to move to stable state. WaitInterval = 50 * time.Millisecond // DadTimeout Time duration CNI waits for an IPv6 address assigned to an interface // to move to stable state before error'ing out. DadTimeout = 10 * time.Second ) // egressContext includes all info to run container ADD or DEL action type egressContext struct { Procsys procsyswrapper.ProcSys Ipam hostipamwrapper.HostIpam Link netlinkwrapper.NetLink Ns nswrapper.NS NsPath string ArgsIfName string Veth vethwrapper.Veth IPTablesIface iptableswrapper.IPTablesIface IptCreator func(iptables.Protocol) (iptableswrapper.IPTablesIface, error) NetConf *NetConf Result *current.Result TmpResult *current.Result Log logger.Logger Mtu int // SnatChain is the chain name for iptables rules SnatChain string // SnatComment is the comment for iptables rules SnatComment string } // NewEgressAddContext create a context for container egress traffic func NewEgressAddContext(nsPath, ifName string) egressContext { return egressContext{ Procsys: procsyswrapper.NewProcSys(), Ipam: hostipamwrapper.NewIpam(), Link: netlinkwrapper.NewNetLink(), Ns: nswrapper.NewNS(), NsPath: nsPath, ArgsIfName: ifName, Veth: vethwrapper.NewSetupVeth(), IptCreator: func(protocol iptables.Protocol) (iptableswrapper.IPTablesIface, error) { return iptableswrapper.NewIPTables(protocol) }, } } // NewEgressDelContext create a context for container egress traffic func NewEgressDelContext(nsPath string) egressContext { return egressContext{ Ipam: hostipamwrapper.NewIpam(), Link: netlinkwrapper.NewNetLink(), Ns: nswrapper.NewNS(), NsPath: nsPath, IptCreator: func(protocol iptables.Protocol) (iptableswrapper.IPTablesIface, error) { return iptableswrapper.NewIPTables(protocol) }, } } func (ec *egressContext) setupContainerVethV4() (*current.Interface, *current.Interface, error) { // The IPAM result will be something like IP=192.168.3.5/24, GW=192.168.3.1. // What we want is really a point-to-point link but veth does not support IFF_POINTTOPOINT. // Next best thing would be to let it ARP but set interface to 192.168.3.5/32 and // add a route like "192.168.3.0/24 via 192.168.3.1 dev $ifName". // Unfortunately that won't work as the GW will be outside the interface's subnet. // Our solution is to configure the interface with 192.168.3.5/24, then delete the // "192.168.3.0/24 dev $ifName" route that was automatically added. Then we add // "192.168.3.1/32 dev $ifName" and "192.168.3.0/24 via 192.168.3.1 dev $ifName". // In other words we force all traffic to ARP via the gateway except for GW itself. hostInterface := ¤t.Interface{} containerInterface := ¤t.Interface{} err := ec.Ns.WithNetNSPath(ec.NsPath, func(hostNS ns.NetNS) error { hostVeth, contVeth0, err := ec.Veth.Setup(ec.NetConf.IfName, ec.Mtu, hostNS) if err != nil { return err } hostInterface.Name = hostVeth.Name hostInterface.Mac = hostVeth.HardwareAddr.String() containerInterface.Name = contVeth0.Name containerInterface.Mac = contVeth0.HardwareAddr.String() containerInterface.Sandbox = ec.NsPath for _, ipc := range ec.TmpResult.IPs { // All addresses apply to the container veth interface ipc.Interface = current.Int(1) } ec.TmpResult.Interfaces = []*current.Interface{hostInterface, containerInterface} if err = ec.Ipam.ConfigureIface(ec.NetConf.IfName, ec.TmpResult); err != nil { return err } contVeth, err := ec.Link.LinkByName(ec.NetConf.IfName) if err != nil { return fmt.Errorf("failed to look up %q: %v", ec.NetConf.IfName, err) } for _, ipc := range ec.TmpResult.IPs { // Delete the route that was automatically added route := netlink.Route{ LinkIndex: contVeth.Attrs().Index, Dst: &net.IPNet{ IP: ipc.Address.IP.Mask(ipc.Address.Mask), Mask: ipc.Address.Mask, }, Scope: netlink.SCOPE_NOWHERE, } if err := ec.Link.RouteDel(&route); err != nil { return fmt.Errorf("failed to delete route %v: %v", route, err) } addrBits := 128 if ipc.Address.IP.To4() != nil { addrBits = 32 } for _, r := range []netlink.Route{ { LinkIndex: contVeth.Attrs().Index, Dst: &net.IPNet{ IP: ipc.Gateway, Mask: net.CIDRMask(addrBits, addrBits), }, Scope: netlink.SCOPE_LINK, Src: ipc.Address.IP, }, { LinkIndex: contVeth.Attrs().Index, Dst: &net.IPNet{ IP: ipc.Address.IP.Mask(ipc.Address.Mask), Mask: ipc.Address.Mask, }, Scope: netlink.SCOPE_UNIVERSE, Gw: ipc.Gateway, Src: ipc.Address.IP, }, } { if err := ec.Link.RouteAdd(&r); err != nil { return fmt.Errorf("failed to add route %v: %v", r, err) } } } return nil }) if err != nil { return nil, nil, err } return hostInterface, containerInterface, nil } func (ec *egressContext) setupHostVethV4(vethName string) error { // hostVeth moved namespaces and may have a new ifindex veth, err := ec.Link.LinkByName(vethName) if err != nil { return fmt.Errorf("failed to lookup %q: %v", vethName, err) } for _, ipc := range ec.TmpResult.IPs { maskLen := 128 if ipc.Address.IP.To4() != nil { maskLen = 32 } // NB: this is modified from standard ptp plugin. ipn := &net.IPNet{ IP: ipc.Gateway, Mask: net.CIDRMask(maskLen, maskLen), } addr := &netlink.Addr{ IPNet: ipn, Scope: int(netlink.SCOPE_LINK), // <- ptp uses SCOPE_UNIVERSE here } if err = ec.Link.AddrAdd(veth, addr); err != nil { return fmt.Errorf("failed to add IP addr (%#v) to veth: %v", ipn, err) } ipn = &net.IPNet{ IP: ipc.Address.IP, Mask: net.CIDRMask(maskLen, maskLen), } err := ec.Link.RouteAdd(&netlink.Route{ LinkIndex: veth.Attrs().Index, Scope: netlink.SCOPE_LINK, // <- ptp uses SCOPE_HOST here Dst: ipn, }) if err != nil && !os.IsExist(err) { return fmt.Errorf("failed to add route on host: %v", err) } } return nil } // cmdAddEgressV4 exec necessary settings to support IPv4 egress traffic in EKS IPv6 cluster func (ec *egressContext) cmdAddEgressV4() (err error) { if ec.IPTablesIface == nil { if ec.IPTablesIface, err = ec.IptCreator(iptables.ProtocolIPv4); err != nil { ec.Log.Error("command iptables not found") return err } } if err = cniutils.EnableIpForwarding(ec.Procsys, ec.TmpResult.IPs); err != nil { return fmt.Errorf("could not enable IP forwarding: %v", err) } // NB: This uses netConf.IfName NOT args.IfName. hostInterface, _, err := ec.setupContainerVethV4() if err != nil { ec.Log.Debugf("failed to setup container Veth: %v", err) return err } if err = ec.setupHostVethV4(hostInterface.Name); err != nil { return err } ec.Log.Debugf("Node IP: %s", ec.NetConf.NodeIP) if ec.NetConf.NodeIP != nil { for _, ipc := range ec.TmpResult.IPs { if ipc.Address.IP.To4() != nil { // add SNAT chain/rules necessary for the container IPv6 egress traffic if err = snat.Add(ec.IPTablesIface, ec.NetConf.NodeIP, ipc.Address.IP, ipv4MulticastRange, ec.SnatChain, ec.SnatComment, ec.NetConf.RandomizeSNAT); err != nil { return err } } } } // Copy interfaces over to result, but not IPs. ec.Result.Interfaces = append(ec.Result.Interfaces, ec.TmpResult.Interfaces...) // Pass through the previous result return types.PrintResult(ec.Result, ec.NetConf.CNIVersion) } // cmdDelEgressV4 exec clear the setting to support IPv4 egress traffic in EKS IPv6 cluster func (ec *egressContext) cmdDelEgress(ipv4 bool) (err error) { var contIPAddrs []netlink.Addr protocol := iptables.ProtocolIPv4 ipFamily := netlink.FAMILY_V4 if !ipv4 { protocol = iptables.ProtocolIPv6 ipFamily = netlink.FAMILY_V6 } if ec.IPTablesIface == nil { if ec.IPTablesIface, err = ec.IptCreator(protocol); err != nil { ec.Log.Error("command iptables not found") // without iptables ir ip6tables, chain/rules could not be removed return err } } if ec.NsPath != "" { _ = ec.Ns.WithNetNSPath(ec.NsPath, func(hostNS ns.NetNS) error { // DelLinkByNameAddr function deletes a link and returns IPs assigned to it, but it // excludes IPs that are not global unicast addresses (or) private IPs. Will not work for // our scenario as we use 169.254.0.0/16 range for v4 IPs. var _err error var link netlink.Link link, _err = ec.Link.LinkByName(ec.NetConf.IfName) if _err != nil { if !cniutils.IsLinkNotFoundError(_err) { ec.Log.Errorf("failed to get container link by name %s: %v", ec.NetConf.IfName, _err) } return nil } //Retrieve IP addresses assigned to the link contIPAddrs, _err = ec.Link.AddrList(link, ipFamily) if _err != nil { ec.Log.Errorf("failed to get IP addresses for link %s: %v", ec.NetConf.IfName, _err) } return _err }) } for _, ipAddr := range contIPAddrs { // for IPv4 egress, IP address is a link-local IPv4 address // for IPv6 egress, IP address is a unique-local IPv6 address // NOTE: IsGlobalUnicast returns true for unique-local IPv6 address if (ipv4 && ipAddr.IP.To4() != nil && ipAddr.IP.IsLinkLocalUnicast()) || (!ipv4 && ipAddr.IP.To4() == nil && ipAddr.IP.IsGlobalUnicast()) { err = snat.Del(ec.IPTablesIface, ipAddr.IP, ec.SnatChain, ec.SnatComment) if err != nil { ec.Log.Errorf("failed to remove iptables chain %s: %v", ec.SnatChain, err) } else { ec.Log.Infof("successfully removed iptables chain %s", ec.SnatChain) } } } return nil } // cmdAddEgressV6 exec necessary settings to support IPv6 egress traffic in EKS IPv4 cluster func (ec *egressContext) cmdAddEgressV6() (err error) { // Per best practice, a new veth pair is created between container ns and node ns // this newly created veth pair is used for container's egress IPv6 traffic // NOTE: // 1. link-local IPv6 addresses are automatically assigned to veth's both ends. // 2. unique-local IPv6 address allocated from host-local IPAM plugin is assigned to veth's container end only // 3. veth node end has no unique-local IPv6 address assigned, only link-local IPv6 address // 4. container IPv6 egress traffic go through node primary interface (eth0) which has an IPv6 global unicast address // 5. IPv6 egress traffic of all containers in a node shares node primary interface (eth0) through SNAT if ec.IPTablesIface == nil { if ec.IPTablesIface, err = ec.IptCreator(iptables.ProtocolIPv6); err != nil { ec.Log.Error("command ip6tables not found") return err } } // first disable IPv6 on container's primary interface (eth0) err = ec.disableContainerInterfaceIPv6(ec.ArgsIfName) if err != nil { ec.Log.Errorf("failed to disable IPv6 on container interface %s", ec.ArgsIfName) return err } hostInterface, containerInterface, err := ec.setupContainerVethV6() if err != nil { ec.Log.Errorf("veth created failed, ns: %s name: %s, mtu: %d, ipam-result: %+v err: %v", ec.NsPath, ec.NetConf.IfName, ec.Mtu, *ec.TmpResult, err) return err } ec.Log.Debugf("veth pair created for container IPv6 egress traffic, container interface: %s ,host interface: %s", containerInterface.Name, hostInterface.Name) containerIPv6 := ec.TmpResult.IPs[0].Address.IP err = ec.setupContainerIPv6Route(hostInterface, containerInterface) if err != nil { ec.Log.Errorf("setupContainerIPv6Route failed: %v", err) return err } ec.Log.Debugf("container route set up successfully") err = ec.setupHostIPv6Route(hostInterface, containerIPv6) if err != nil { ec.Log.Errorf("setupHostIPv6Route failed: %v", err) return err } ec.Log.Debugf("host IPv6 route set up successfully") // set up SNAT in host for container IPv6 egress traffic // following line adds an ip6tables entries to NAT for IPv6 traffic between container v6if0 and node primary ENI (eth0) err = snat.Add(ec.IPTablesIface, ec.NetConf.NodeIP, containerIPv6, ipv6MulticastRange, ec.SnatChain, ec.SnatComment, ec.NetConf.RandomizeSNAT) if err != nil { ec.Log.Errorf("setup host snat failed: %v", err) return err } ec.Log.Debugf("host IPv6 SNAT set up successfully") // Copy interfaces over to result, but not IPs. ec.Result.Interfaces = append(ec.Result.Interfaces, ec.TmpResult.Interfaces...) // Pass through the previous result return types.PrintResult(ec.Result, ec.NetConf.CNIVersion) } func (ec *egressContext) disableContainerInterfaceIPv6(ifName string) error { return ec.Ns.WithNetNSPath(ec.NsPath, func(hostNS ns.NetNS) error { var entry = "net/ipv6/conf/" + ifName + "/disable_ipv6" return ec.Procsys.Set(entry, "1") }) } func (ec *egressContext) setupContainerIPv6Route(hostInterface, containerInterface *current.Interface) (err error) { var hostIfIPv6 net.IP var hostNetIf netlink.Link var addrs []netlink.Addr hostNetIf, err = ec.Link.LinkByName(hostInterface.Name) if err != nil { return err } addrs, err = ec.Link.AddrList(hostNetIf, netlink.FAMILY_V6) if err != nil { return err } for _, addr := range addrs { // search for interface's link-local IPv6 address if addr.IP.To4() == nil && addr.IP.IsLinkLocalUnicast() { hostIfIPv6 = addr.IP break } } if hostIfIPv6 == nil { return fmt.Errorf("link-local IPv6 address not found on host interface %s", hostInterface.Name) } return ec.Ns.WithNetNSPath(ec.NsPath, func(hostNS ns.NetNS) error { var containerVethIf netlink.Link containerVethIf, err = ec.Link.LinkByName(containerInterface.Name) if err != nil { return err } // set up from container off-cluster IPv6 route (egress) // all from container IPv6 traffic via host veth interface's link-local IPv6 address if err := ec.Link.RouteReplace(&netlink.Route{ LinkIndex: containerVethIf.Attrs().Index, Dst: &net.IPNet{ IP: net.IPv6zero, Mask: net.CIDRMask(0, 128), }, Scope: netlink.SCOPE_UNIVERSE, Gw: hostIfIPv6}); err != nil { return fmt.Errorf("failed to add default IPv6 route via %s: %v", hostIfIPv6, err) } return nil }) } // setupHostIPv6Route adds a IPv6 route for traffic destined to container/pod from external/off-cluster func (ec *egressContext) setupHostIPv6Route(hostInterface *current.Interface, containerIPv6 net.IP) error { link := ec.Link hostIf, err := link.LinkByName(hostInterface.Name) if err != nil { return err } // set up to container return traffic route in host return link.RouteAdd(&netlink.Route{ LinkIndex: hostIf.Attrs().Index, Scope: netlink.SCOPE_HOST, Dst: &net.IPNet{ IP: containerIPv6, Mask: net.CIDRMask(128, 128), }, }) } func (ec *egressContext) setupContainerVethV6() (hostInterface, containerInterface *current.Interface, err error) { err = ec.Ns.WithNetNSPath(ec.NsPath, func(hostNS ns.NetNS) error { var hostVeth net.Interface var contVeth net.Interface hostVeth, contVeth, err = ec.Veth.Setup(ec.NetConf.IfName, ec.Mtu, hostNS) if err != nil { return err } hostInterface = ¤t.Interface{ Name: hostVeth.Name, Mac: hostVeth.HardwareAddr.String(), } containerInterface = ¤t.Interface{ Name: contVeth.Name, Mac: contVeth.HardwareAddr.String(), Sandbox: ec.NsPath, } ec.TmpResult.Interfaces = []*current.Interface{hostInterface, containerInterface} for _, ipc := range ec.TmpResult.IPs { // Address (IPv6 ULA address) apply to the container veth interface - v6if0 ipc.Interface = current.Int(1) } err = ec.Ipam.ConfigureIface(ec.NetConf.IfName, ec.TmpResult) if err != nil { return err } return cniutils.WaitForAddressesToBeStable(ec.Link, contVeth.Name, DadTimeout, WaitInterval) }) return hostInterface, containerInterface, err } func (ec *egressContext) hostLocalIpamAdd(stdinData []byte) (err error) { var ipamResultI types.Result if ipamResultI, err = ec.Ipam.ExecAdd(ec.NetConf.IPAM.Type, stdinData); err != nil { return fmt.Errorf("running IPAM plugin failed: %v", err) } if ec.TmpResult, err = current.NewResultFromResult(ipamResultI); err != nil { return err } ipCount := len(ec.TmpResult.IPs) if ipCount == 0 { return fmt.Errorf("IPAM plugin returned zero IPs") } else if ipCount > 1 { return fmt.Errorf("IPAM plugin is expected to return 1 IP address, but returned %d IPs, ", ipCount) } return nil }