* (rco@di.uminho.pt) Routing table insertion and update
* Linus Torvalds : Rewrote bits to be sensible
* Alan Cox : Added BSD route gw semantics
- * Alan Cox : Super /proc >4K
+ * Alan Cox : Super /proc >4K
* Alan Cox : MTU in route table
* Alan Cox : MSS actually. Also added the window
* clamper.
* Alan Cox : Faster /proc handling
* Alexey Kuznetsov : Massive rework to support tree based routing,
* routing caches and better behaviour.
- *
+ *
* Olaf Erb : irtt wasn't being copied right.
* Bjorn Ekwall : Kerneld route support.
* Alan Cox : Multicast fixed (I hope)
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#include <net/netevent.h>
+#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
{
struct rt_cache_iter_state *st = rcu_dereference(seq->private);
- r = r->u.rt_next;
+ r = r->u.dst.rt_next;
while (!r) {
rcu_read_unlock_bh();
if (--st->bucket < 0)
dev_queue_xmit) : 0,
r->rt_spec_dst);
seq_printf(seq, "%-127s\n", temp);
- }
- return 0;
+ }
+ return 0;
}
-static struct seq_operations rt_cache_seq_ops = {
+static const struct seq_operations rt_cache_seq_ops = {
.start = rt_cache_seq_start,
.next = rt_cache_seq_next,
.stop = rt_cache_seq_stop,
goto out;
}
-static struct file_operations rt_cache_seq_fops = {
+static const struct file_operations rt_cache_seq_fops = {
.owner = THIS_MODULE,
.open = rt_cache_seq_open,
.read = seq_read,
return &per_cpu(rt_cache_stat, cpu);
}
return NULL;
-
+
}
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
return 0;
}
-
+
seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
atomic_read(&ipv4_dst_ops.entries),
st->out_hit,
st->out_slow_tot,
- st->out_slow_mc,
+ st->out_slow_mc,
st->gc_total,
st->gc_ignored,
return 0;
}
-static struct seq_operations rt_cpu_seq_ops = {
+static const struct seq_operations rt_cpu_seq_ops = {
.start = rt_cpu_seq_start,
.next = rt_cpu_seq_next,
.stop = rt_cpu_seq_stop,
return seq_open(file, &rt_cpu_seq_ops);
}
-static struct file_operations rt_cpu_seq_fops = {
+static const struct file_operations rt_cpu_seq_fops = {
.owner = THIS_MODULE,
.open = rt_cpu_seq_open,
.read = seq_read,
};
#endif /* CONFIG_PROC_FS */
-
+
static __inline__ void rt_free(struct rtable *rt)
{
multipath_remove(rt);
/* Kill broadcast/multicast entries very aggresively, if they
collide in hash table with more useful entries */
return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
- rth->fl.iif && rth->u.rt_next;
+ rth->fl.iif && rth->u.dst.rt_next;
}
static __inline__ int rt_valuable(struct rtable *rth)
if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
compare_keys(&(*rthp)->fl, &expentry->fl)) {
if (*rthp == expentry) {
- *rthp = rth->u.rt_next;
+ *rthp = rth->u.dst.rt_next;
continue;
} else {
- *rthp = rth->u.rt_next;
+ *rthp = rth->u.dst.rt_next;
rt_free(rth);
if (removed_count)
++(*removed_count);
} else {
if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
passedexpired && !nextstep)
- nextstep = &rth->u.rt_next;
+ nextstep = &rth->u.dst.rt_next;
- rthp = &rth->u.rt_next;
+ rthp = &rth->u.dst.rt_next;
}
}
/* Entry is expired even if it is in use */
if (time_before_eq(now, rth->u.dst.expires)) {
tmo >>= 1;
- rthp = &rth->u.rt_next;
+ rthp = &rth->u.dst.rt_next;
continue;
}
} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
tmo >>= 1;
- rthp = &rth->u.rt_next;
+ rthp = &rth->u.dst.rt_next;
continue;
}
if (!rthp)
break;
} else {
- *rthp = rth->u.rt_next;
+ *rthp = rth->u.dst.rt_next;
rt_free(rth);
}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
- *rthp = rth->u.rt_next;
- rt_free(rth);
+ *rthp = rth->u.dst.rt_next;
+ rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
spin_unlock(rt_hash_lock_addr(i));
spin_unlock_bh(rt_hash_lock_addr(i));
for (; rth; rth = next) {
- next = rth->u.rt_next;
+ next = rth->u.dst.rt_next;
rt_free(rth);
}
}
if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
tmo = 0;
-
+
if (delay > tmo)
delay = tmo;
}
while ((rth = *rthp) != NULL) {
if (!rt_may_expire(rth, tmo, expire)) {
tmo >>= 1;
- rthp = &rth->u.rt_next;
+ rthp = &rth->u.dst.rt_next;
continue;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (!rthp)
break;
} else {
- *rthp = rth->u.rt_next;
+ *rthp = rth->u.dst.rt_next;
rt_free(rth);
goal--;
}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
- *rthp = rth->u.rt_next;
+ *rthp = rth->u.dst.rt_next;
rt_free(rth);
goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
if (compare_keys(&rth->fl, &rt->fl)) {
#endif
/* Put it first */
- *rthp = rth->u.rt_next;
+ *rthp = rth->u.dst.rt_next;
/*
* Since lookup is lockfree, the deletion
* must be visible to another weakly ordered CPU before
* the insertion at the start of the hash chain.
*/
- rcu_assign_pointer(rth->u.rt_next,
+ rcu_assign_pointer(rth->u.dst.rt_next,
rt_hash_table[hash].chain);
/*
* Since lookup is lockfree, the update writes
chain_length++;
- rthp = &rth->u.rt_next;
+ rthp = &rth->u.dst.rt_next;
}
if (cand) {
* only 2 entries per bucket. We will see.
*/
if (chain_length > ip_rt_gc_elasticity) {
- *candp = cand->u.rt_next;
+ *candp = cand->u.dst.rt_next;
rt_free(cand);
}
}
}
}
- rt->u.rt_next = rt_hash_table[hash].chain;
+ rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
- if (rt->u.rt_next) {
+ if (rt->u.dst.rt_next) {
struct rtable *trt;
printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
NIPQUAD(rt->rt_dst));
- for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
+ for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
printk("\n");
}
return;
}
} else
- printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
+ printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
__builtin_return_address(0));
ip_select_fb_ident(iph);
spin_lock_bh(rt_hash_lock_addr(hash));
ip_rt_put(rt);
for (rthp = &rt_hash_table[hash].chain; *rthp;
- rthp = &(*rthp)->u.rt_next)
+ rthp = &(*rthp)->u.dst.rt_next)
if (*rthp == rt) {
- *rthp = rt->u.rt_next;
+ *rthp = rt->u.dst.rt_next;
rt_free(rt);
break;
}
rth->fl.fl4_src != skeys[i] ||
rth->fl.oif != ikeys[k] ||
rth->fl.iif != 0) {
- rthp = &rth->u.rt_next;
+ rthp = &rth->u.dst.rt_next;
continue;
}
/* Copy all the information. */
*rt = *rth;
- INIT_RCU_HEAD(&rt->u.dst.rcu_head);
+ INIT_RCU_HEAD(&rt->u.dst.rcu_head);
rt->u.dst.__use = 1;
atomic_set(&rt->u.dst.__refcnt, 1);
rt->u.dst.child = NULL;
rt_drop(rt);
goto do_next;
}
-
+
netevent.old = &rth->u.dst;
netevent.new = &rt->u.dst;
- call_netevent_notifiers(NETEVENT_REDIRECT,
- &netevent);
+ call_netevent_notifiers(NETEVENT_REDIRECT,
+ &netevent);
rt_del(hash, rth);
if (!rt_intern_hash(hash, rt, &rt))
#endif
}
out:
- in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static int ip_error(struct sk_buff *skb)
out: kfree_skb(skb);
return 0;
-}
+}
/*
* The last two values are not from the RFC but
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
int i;
-
+
for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
if (old_mtu > mtu_plateau[i])
return mtu_plateau[i];
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.rt_next)) {
+ rth = rcu_dereference(rth->u.dst.rt_next)) {
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == skeys[i] &&
rth->rt_dst == daddr &&
mtu = guess_mtu(old_mtu);
}
if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
- if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
+ if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
dst_confirm(&rth->u.dst);
if (mtu < ip_rt_min_pmtu) {
mtu = ip_rt_min_pmtu;
static int ip_rt_bug(struct sk_buff *skb)
{
printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
- NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
+ NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
skb->dev ? skb->dev->name : "?");
kfree_skb(skb);
return 0;
#endif
set_class_tag(rt, itag);
#endif
- rt->rt_type = res->type;
+ rt->rt_type = res->type;
}
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
printk(KERN_WARNING "martian source %u.%u.%u.%u from "
"%u.%u.%u.%u, on dev %s\n",
NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
- if (dev->hard_header_len && skb->mac.raw) {
+ if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
int i;
- unsigned char *p = skb->mac.raw;
+ const unsigned char *p = skb_mac_header(skb);
printk(KERN_WARNING "ll header: ");
for (i = 0; i < dev->hard_header_len; i++, p++) {
printk("%02x", *p);
#endif
}
-static inline int __mkroute_input(struct sk_buff *skb,
- struct fib_result* res,
- struct in_device *in_dev,
+static inline int __mkroute_input(struct sk_buff *skb,
+ struct fib_result* res,
+ struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
- struct rtable **result)
+ struct rtable **result)
{
struct rtable *rth;
}
- err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
+ err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
in_dev->dev, &spec_dst, &itag);
if (err < 0) {
- ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
+ ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
-
+
err = -EINVAL;
goto cleanup;
}
/* release the working reference to the output device */
in_dev_put(out_dev);
return err;
-}
+}
-static inline int ip_mkroute_input_def(struct sk_buff *skb,
- struct fib_result* res,
+static inline int ip_mkroute_input_def(struct sk_buff *skb,
+ struct fib_result* res,
const struct flowi *fl,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
/* put it into the cache */
hash = rt_hash(daddr, saddr, fl->iif);
- return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
-static inline int ip_mkroute_input(struct sk_buff *skb,
- struct fib_result* res,
+static inline int ip_mkroute_input(struct sk_buff *skb,
+ struct fib_result* res,
const struct flowi *fl,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
if (hopcount < 2)
return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
saddr, tos);
-
+
/* add all alternatives to the routing cache */
for (hop = 0; hop < hopcount; hop++) {
res->nh_sel = hop;
goto e_nobufs;
if (err == -EINVAL)
goto e_inval;
-
+
done:
in_dev_put(in_dev);
if (free_res)
#endif
e_hostunreach:
- err = -EHOSTUNREACH;
- goto done;
+ err = -EHOSTUNREACH;
+ goto done;
e_inval:
err = -EINVAL;
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.rt_next)) {
+ rth = rcu_dereference(rth->u.dst.rt_next)) {
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == saddr &&
rth->fl.iif == iif &&
rcu_read_lock();
if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
int our = ip_check_mc(in_dev, daddr, saddr,
- skb->nh.iph->protocol);
+ ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
|| (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
}
static inline int __mkroute_output(struct rtable **result,
- struct fib_result* res,
+ struct fib_result* res,
const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned flags)
{
struct rtable *rth;
struct in_device *in_dev;
}
} else if (res->type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST|RTCF_LOCAL;
- if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
+ if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
oldflp->proto))
flags &= ~RTCF_LOCAL;
/* If multicast route do not exist use
if (!rth) {
err = -ENOBUFS;
goto cleanup;
- }
+ }
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
rth->rt_dst = fl->fl4_dst;
rth->rt_src = fl->fl4_src;
rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
- /* get references to the devices that are to be hold by the routing
+ /* get references to the devices that are to be hold by the routing
cache entry */
rth->u.dst.dev = dev_out;
dev_hold(dev_out);
}
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
rth->rt_spec_dst = fl->fl4_src;
- if (flags & RTCF_LOCAL &&
+ if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
rth->u.dst.output = ip_mc_output;
RT_CACHE_STAT_INC(out_slow_mc);
hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
err = rt_intern_hash(hash, rth, rp);
}
-
+
return err;
}
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
dev_out = ip_dev_find(oldflp->fl4_src);
- if (dev_out == NULL)
+ if ((dev_out == NULL) && !(sysctl_ip_nonlocal_bind))
goto out;
/* I removed check for oif == dev_out->oif here.
of another iface. --ANK
*/
- if (oldflp->oif == 0
+ if (dev_out && oldflp->oif == 0
&& (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
/* Special hack: user can direct multicasts
and limited broadcast via necessary interface
rcu_read_lock_bh();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.rt_next)) {
+ rth = rcu_dereference(rth->u.dst.rt_next)) {
if (rth->fl.fl4_dst == flp->fl4_dst &&
rth->fl.fl4_src == flp->fl4_src &&
rth->fl.iif == 0 &&
nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
if (nlh == NULL)
- return -ENOBUFS;
+ return -EMSGSIZE;
r = nlmsg_data(nlh);
r->rtm_family = AF_INET;
id = rt->peer->ip_id_count;
if (rt->peer->tcp_ts_stamp) {
ts = rt->peer->tcp_ts;
- tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
+ tsage = get_seconds() - rt->peer->tcp_ts_stamp;
}
}
return nlmsg_end(skb, nlh);
nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
struct rtmsg *rtm;
struct nlattr *tb[RTA_MAX+1];
/* Reserve room for dummy headers, this skb can pass
through good chunk of routing engine.
*/
- skb->mac.raw = skb->nh.raw = skb->data;
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
- skb->nh.iph->protocol = IPPROTO_ICMP;
+ ip_hdr(skb)->protocol = IPPROTO_ICMP;
skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
s_idx = 0;
rcu_read_lock_bh();
for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
- rt = rcu_dereference(rt->u.rt_next), idx++) {
+ rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
if (idx < s_idx)
continue;
skb->dst = dst_clone(&rt->u.dst);
if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE,
1, NLM_F_MULTI) <= 0) {
dst_release(xchg(&skb->dst, NULL));
rcu_read_unlock_bh();
proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
rt_cache_flush(flush_delay);
return 0;
- }
+ }
return -EINVAL;
}
if (newlen != sizeof(int))
return -EINVAL;
if (get_user(delay, (int __user *)newval))
- return -EFAULT;
- rt_cache_flush(delay);
+ return -EFAULT;
+ rt_cache_flush(delay);
return 0;
}
ctl_table ipv4_route_table[] = {
- {
+ {
.ctl_name = NET_IPV4_ROUTE_FLUSH,
.procname = "flush",
.data = &flush_delay,
},
{
/* Deprecated. Use gc_min_interval_ms */
-
+
.ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
.procname = "gc_min_interval",
.data = &ip_rt_gc_min_interval,
{
struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
- !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
- proc_net_stat))) {
+ !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
+ proc_net_stat))) {
return -ENOMEM;
}
rtstat_pde->proc_fops = &rt_cpu_seq_fops;
xfrm_init();
xfrm4_init();
#endif
+ rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
+
return rc;
}