Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
[pandora-kernel.git] / net / ipv6 / addrconf.c
index 5b0c041..030fefd 100644 (file)
@@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
        .max_addresses          = IPV6_MAX_ADDRESSES,
        .accept_ra_defrtr       = 1,
        .accept_ra_from_local   = 0,
+       .accept_ra_min_hop_limit= 1,
        .accept_ra_pinfo        = 1,
 #ifdef CONFIG_IPV6_ROUTER_PREF
        .accept_ra_rtr_pref     = 1,
@@ -211,7 +212,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
        .accept_ra_mtu          = 1,
        .stable_secret          = {
                .initialized = false,
-       }
+       },
+       .use_oif_addrs_only     = 0,
+       .ignore_routes_with_linkdown = 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -236,6 +239,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
        .max_addresses          = IPV6_MAX_ADDRESSES,
        .accept_ra_defrtr       = 1,
        .accept_ra_from_local   = 0,
+       .accept_ra_min_hop_limit= 1,
        .accept_ra_pinfo        = 1,
 #ifdef CONFIG_IPV6_ROUTER_PREF
        .accept_ra_rtr_pref     = 1,
@@ -253,6 +257,8 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
        .stable_secret          = {
                .initialized = false,
        },
+       .use_oif_addrs_only     = 0,
+       .ignore_routes_with_linkdown = 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -468,6 +474,9 @@ static int inet6_netconf_msgsize_devconf(int type)
        if (type == -1 || type == NETCONFA_PROXY_NEIGH)
                size += nla_total_size(4);
 
+       if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+               size += nla_total_size(4);
+
        return size;
 }
 
@@ -504,6 +513,11 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
            nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
                goto nla_put_failure;
 
+       if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+           nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+                       devconf->ignore_routes_with_linkdown) < 0)
+               goto nla_put_failure;
+
        nlmsg_end(skb, nlh);
        return 0;
 
@@ -540,6 +554,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = {
        [NETCONFA_IFINDEX]      = { .len = sizeof(int) },
        [NETCONFA_FORWARDING]   = { .len = sizeof(int) },
        [NETCONFA_PROXY_NEIGH]  = { .len = sizeof(int) },
+       [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN]  = { .len = sizeof(int) },
 };
 
 static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
@@ -762,6 +777,63 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
                rt6_purge_dflt_routers(net);
        return 1;
 }
+
+static void addrconf_linkdown_change(struct net *net, __s32 newf)
+{
+       struct net_device *dev;
+       struct inet6_dev *idev;
+
+       for_each_netdev(net, dev) {
+               idev = __in6_dev_get(dev);
+               if (idev) {
+                       int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);
+
+                       idev->cnf.ignore_routes_with_linkdown = newf;
+                       if (changed)
+                               inet6_netconf_notify_devconf(dev_net(dev),
+                                                            NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+                                                            dev->ifindex,
+                                                            &idev->cnf);
+               }
+       }
+}
+
+static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf)
+{
+       struct net *net;
+       int old;
+
+       if (!rtnl_trylock())
+               return restart_syscall();
+
+       net = (struct net *)table->extra2;
+       old = *p;
+       *p = newf;
+
+       if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) {
+               if ((!newf) ^ (!old))
+                       inet6_netconf_notify_devconf(net,
+                                                    NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+                                                    NETCONFA_IFINDEX_DEFAULT,
+                                                    net->ipv6.devconf_dflt);
+               rtnl_unlock();
+               return 0;
+       }
+
+       if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) {
+               net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf;
+               addrconf_linkdown_change(net, newf);
+               if ((!newf) ^ (!old))
+                       inet6_netconf_notify_devconf(net,
+                                                    NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+                                                    NETCONFA_IFINDEX_ALL,
+                                                    net->ipv6.devconf_all);
+       }
+       rtnl_unlock();
+
+       return 1;
+}
+
 #endif
 
 /* Nobody refers to this ifaddr, destroy it */
@@ -1358,15 +1430,96 @@ out:
        return ret;
 }
 
+static int __ipv6_dev_get_saddr(struct net *net,
+                               struct ipv6_saddr_dst *dst,
+                               struct inet6_dev *idev,
+                               struct ipv6_saddr_score *scores,
+                               int hiscore_idx)
+{
+       struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];
+
+       read_lock_bh(&idev->lock);
+       list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
+               int i;
+
+               /*
+                * - Tentative Address (RFC2462 section 5.4)
+                *  - A tentative address is not considered
+                *    "assigned to an interface" in the traditional
+                *    sense, unless it is also flagged as optimistic.
+                * - Candidate Source Address (section 4)
+                *  - In any case, anycast addresses, multicast
+                *    addresses, and the unspecified address MUST
+                *    NOT be included in a candidate set.
+                */
+               if ((score->ifa->flags & IFA_F_TENTATIVE) &&
+                   (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
+                       continue;
+
+               score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+
+               if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
+                            score->addr_type & IPV6_ADDR_MULTICAST)) {
+                       net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
+                                           idev->dev->name);
+                       continue;
+               }
+
+               score->rule = -1;
+               bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
+
+               for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
+                       int minihiscore, miniscore;
+
+                       minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i);
+                       miniscore = ipv6_get_saddr_eval(net, score, dst, i);
+
+                       if (minihiscore > miniscore) {
+                               if (i == IPV6_SADDR_RULE_SCOPE &&
+                                   score->scopedist > 0) {
+                                       /*
+                                        * special case:
+                                        * each remaining entry
+                                        * has too small (not enough)
+                                        * scope, because ifa entries
+                                        * are sorted by their scope
+                                        * values.
+                                        */
+                                       goto out;
+                               }
+                               break;
+                       } else if (minihiscore < miniscore) {
+                               if (hiscore->ifa)
+                                       in6_ifa_put(hiscore->ifa);
+
+                               in6_ifa_hold(score->ifa);
+
+                               swap(hiscore, score);
+                               hiscore_idx = 1 - hiscore_idx;
+
+                               /* restore our iterator */
+                               score->ifa = hiscore->ifa;
+
+                               break;
+                       }
+               }
+       }
+out:
+       read_unlock_bh(&idev->lock);
+       return hiscore_idx;
+}
+
 int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
                       const struct in6_addr *daddr, unsigned int prefs,
                       struct in6_addr *saddr)
 {
-       struct ipv6_saddr_score scores[2],
-                               *score = &scores[0], *hiscore = &scores[1];
+       struct ipv6_saddr_score scores[2], *hiscore;
        struct ipv6_saddr_dst dst;
+       struct inet6_dev *idev;
        struct net_device *dev;
        int dst_type;
+       bool use_oif_addr = false;
+       int hiscore_idx = 0;
 
        dst_type = __ipv6_addr_type(daddr);
        dst.addr = daddr;
@@ -1375,105 +1528,50 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
        dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
        dst.prefs = prefs;
 
-       hiscore->rule = -1;
-       hiscore->ifa = NULL;
+       scores[hiscore_idx].rule = -1;
+       scores[hiscore_idx].ifa = NULL;
 
        rcu_read_lock();
 
-       for_each_netdev_rcu(net, dev) {
-               struct inet6_dev *idev;
-
-               /* Candidate Source Address (section 4)
-                *  - multicast and link-local destination address,
-                *    the set of candidate source address MUST only
-                *    include addresses assigned to interfaces
-                *    belonging to the same link as the outgoing
-                *    interface.
-                * (- For site-local destination addresses, the
-                *    set of candidate source addresses MUST only
-                *    include addresses assigned to interfaces
-                *    belonging to the same site as the outgoing
-                *    interface.)
-                */
-               if (((dst_type & IPV6_ADDR_MULTICAST) ||
-                    dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
-                   dst.ifindex && dev->ifindex != dst.ifindex)
-                       continue;
-
-               idev = __in6_dev_get(dev);
-               if (!idev)
-                       continue;
-
-               read_lock_bh(&idev->lock);
-               list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
-                       int i;
-
-                       /*
-                        * - Tentative Address (RFC2462 section 5.4)
-                        *  - A tentative address is not considered
-                        *    "assigned to an interface" in the traditional
-                        *    sense, unless it is also flagged as optimistic.
-                        * - Candidate Source Address (section 4)
-                        *  - In any case, anycast addresses, multicast
-                        *    addresses, and the unspecified address MUST
-                        *    NOT be included in a candidate set.
-                        */
-                       if ((score->ifa->flags & IFA_F_TENTATIVE) &&
-                           (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
-                               continue;
-
-                       score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+       /* Candidate Source Address (section 4)
+        *  - multicast and link-local destination address,
+        *    the set of candidate source address MUST only
+        *    include addresses assigned to interfaces
+        *    belonging to the same link as the outgoing
+        *    interface.
+        * (- For site-local destination addresses, the
+        *    set of candidate source addresses MUST only
+        *    include addresses assigned to interfaces
+        *    belonging to the same site as the outgoing
+        *    interface.)
+        *  - "It is RECOMMENDED that the candidate source addresses
+        *    be the set of unicast addresses assigned to the
+        *    interface that will be used to send to the destination
+        *    (the 'outgoing' interface)." (RFC 6724)
+        */
+       if (dst_dev) {
+               idev = __in6_dev_get(dst_dev);
+               if ((dst_type & IPV6_ADDR_MULTICAST) ||
+                   dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
+                   (idev && idev->cnf.use_oif_addrs_only)) {
+                       use_oif_addr = true;
+               }
+       }
 
-                       if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
-                                    score->addr_type & IPV6_ADDR_MULTICAST)) {
-                               net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
-                                                   dev->name);
+       if (use_oif_addr) {
+               if (idev)
+                       hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
+       } else {
+               for_each_netdev_rcu(net, dev) {
+                       idev = __in6_dev_get(dev);
+                       if (!idev)
                                continue;
-                       }
-
-                       score->rule = -1;
-                       bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
-
-                       for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
-                               int minihiscore, miniscore;
-
-                               minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i);
-                               miniscore = ipv6_get_saddr_eval(net, score, &dst, i);
-
-                               if (minihiscore > miniscore) {
-                                       if (i == IPV6_SADDR_RULE_SCOPE &&
-                                           score->scopedist > 0) {
-                                               /*
-                                                * special case:
-                                                * each remaining entry
-                                                * has too small (not enough)
-                                                * scope, because ifa entries
-                                                * are sorted by their scope
-                                                * values.
-                                                */
-                                               goto try_nextdev;
-                                       }
-                                       break;
-                               } else if (minihiscore < miniscore) {
-                                       if (hiscore->ifa)
-                                               in6_ifa_put(hiscore->ifa);
-
-                                       in6_ifa_hold(score->ifa);
-
-                                       swap(hiscore, score);
-
-                                       /* restore our iterator */
-                                       score->ifa = hiscore->ifa;
-
-                                       break;
-                               }
-                       }
+                       hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
                }
-try_nextdev:
-               read_unlock_bh(&idev->lock);
        }
        rcu_read_unlock();
 
+       hiscore = &scores[hiscore_idx];
        if (!hiscore->ifa)
                return -EADDRNOTAVAIL;
 
@@ -3527,7 +3625,7 @@ static void addrconf_dad_work(struct work_struct *w)
 
        /* send a neighbour solicitation for our addr */
        addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
-       ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any);
+       ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any, NULL);
 out:
        in6_ifa_put(ifp);
        rtnl_unlock();
@@ -4529,6 +4627,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
        array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
        array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
        array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+       array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit;
        array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
 #ifdef CONFIG_IPV6_ROUTER_PREF
        array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
@@ -4554,7 +4653,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
        array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
        array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
        array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
+       array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
        /* we omit DEVCONF_STABLE_SECRET for now */
+       array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -4574,6 +4675,7 @@ static inline size_t inet6_if_nlmsg_size(void)
               + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
               + nla_total_size(4) /* IFLA_MTU */
               + nla_total_size(4) /* IFLA_LINK */
+              + nla_total_size(1) /* IFLA_OPERSTATE */
               + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */
 }
 
@@ -4593,18 +4695,24 @@ static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
 }
 
 static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib,
-                                     int items, int bytes, size_t syncpoff)
+                                       int bytes, size_t syncpoff)
 {
-       int i;
-       int pad = bytes - sizeof(u64) * items;
+       int i, c;
+       u64 buff[IPSTATS_MIB_MAX];
+       int pad = bytes - sizeof(u64) * IPSTATS_MIB_MAX;
+
        BUG_ON(pad < 0);
 
-       /* Use put_unaligned() because stats may not be aligned for u64. */
-       put_unaligned(items, &stats[0]);
-       for (i = 1; i < items; i++)
-               put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]);
+       memset(buff, 0, sizeof(buff));
+       buff[0] = IPSTATS_MIB_MAX;
 
-       memset(&stats[items], 0, pad);
+       for_each_possible_cpu(c) {
+               for (i = 1; i < IPSTATS_MIB_MAX; i++)
+                       buff[i] += snmp_get_cpu_field64(mib, c, i, syncpoff);
+       }
+
+       memcpy(stats, buff, IPSTATS_MIB_MAX * sizeof(u64));
+       memset(&stats[IPSTATS_MIB_MAX], 0, pad);
 }
 
 static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
@@ -4612,8 +4720,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
 {
        switch (attrtype) {
        case IFLA_INET6_STATS:
-               __snmp6_fill_stats64(stats, idev->stats.ipv6,
-                                    IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp));
+               __snmp6_fill_stats64(stats, idev->stats.ipv6, bytes,
+                                    offsetof(struct ipstats_mib, syncp));
                break;
        case IFLA_INET6_ICMP6STATS:
                __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes);
@@ -4830,7 +4938,9 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
             nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
            nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
            (dev->ifindex != dev_get_iflink(dev) &&
-            nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
+            nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) ||
+           nla_put_u8(skb, IFLA_OPERSTATE,
+                      netif_running(dev) ? dev->operstate : IF_OPER_DOWN))
                goto nla_put_failure;
        protoinfo = nla_nest_start(skb, IFLA_PROTINFO);
        if (!protoinfo)
@@ -5275,6 +5385,34 @@ out:
        return err;
 }
 
+static
+int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
+                                               int write,
+                                               void __user *buffer,
+                                               size_t *lenp,
+                                               loff_t *ppos)
+{
+       int *valp = ctl->data;
+       int val = *valp;
+       loff_t pos = *ppos;
+       struct ctl_table lctl;
+       int ret;
+
+       /* ctl->data points to idev->cnf.ignore_routes_when_linkdown
+        * we should not modify it until we get the rtnl lock.
+        */
+       lctl = *ctl;
+       lctl.data = &val;
+
+       ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
+
+       if (write)
+               ret = addrconf_fixup_linkdown(ctl, valp, val);
+       if (ret)
+               *ppos = pos;
+       return ret;
+}
+
 static struct addrconf_sysctl_table
 {
        struct ctl_table_header *sysctl_header;
@@ -5424,6 +5562,13 @@ static struct addrconf_sysctl_table
                        .mode           = 0644,
                        .proc_handler   = proc_dointvec,
                },
+               {
+                       .procname       = "accept_ra_min_hop_limit",
+                       .data           = &ipv6_devconf.accept_ra_min_hop_limit,
+                       .maxlen         = sizeof(int),
+                       .mode           = 0644,
+                       .proc_handler   = proc_dointvec,
+               },
                {
                        .procname       = "accept_ra_pinfo",
                        .data           = &ipv6_devconf.accept_ra_pinfo,
@@ -5553,6 +5698,20 @@ static struct addrconf_sysctl_table
                        .mode           = 0600,
                        .proc_handler   = addrconf_sysctl_stable_secret,
                },
+               {
+                       .procname       = "use_oif_addrs_only",
+                       .data           = &ipv6_devconf.use_oif_addrs_only,
+                       .maxlen         = sizeof(int),
+                       .mode           = 0644,
+                       .proc_handler   = proc_dointvec,
+               },
+               {
+                       .procname       = "ignore_routes_with_linkdown",
+                       .data           = &ipv6_devconf.ignore_routes_with_linkdown,
+                       .maxlen         = sizeof(int),
+                       .mode           = 0644,
+                       .proc_handler   = addrconf_sysctl_ignore_routes_with_linkdown,
+               },
                {
                        /* sentinel */
                }