[IPV4] fib_trie: rescan if key is lost during dump
[pandora-kernel.git] / net / ipv4 / fib_trie.c
index 1a9231f..35851c9 100644 (file)
@@ -447,7 +447,6 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 
        BUG_ON(i >= 1<<tn->bits);
 
-
        /* update emptyChildren */
        if (n == NULL && chi != NULL)
                tn->empty_children++;
@@ -1206,20 +1205,45 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
         * and we need to allocate a new one of those as well.
         */
 
-       if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
-               struct fib_alias *fa_orig;
+       if (fa && fa->fa_tos == tos &&
+           fa->fa_info->fib_priority == fi->fib_priority) {
+               struct fib_alias *fa_first, *fa_match;
 
                err = -EEXIST;
                if (cfg->fc_nlflags & NLM_F_EXCL)
                        goto out;
 
+               /* We have 2 goals:
+                * 1. Find exact match for type, scope, fib_info to avoid
+                * duplicate routes
+                * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
+                */
+               fa_match = NULL;
+               fa_first = fa;
+               fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+               list_for_each_entry_continue(fa, fa_head, fa_list) {
+                       if (fa->fa_tos != tos)
+                               break;
+                       if (fa->fa_info->fib_priority != fi->fib_priority)
+                               break;
+                       if (fa->fa_type == cfg->fc_type &&
+                           fa->fa_scope == cfg->fc_scope &&
+                           fa->fa_info == fi) {
+                               fa_match = fa;
+                               break;
+                       }
+               }
+
                if (cfg->fc_nlflags & NLM_F_REPLACE) {
                        struct fib_info *fi_drop;
                        u8 state;
 
-                       if (fi->fib_treeref > 1)
+                       fa = fa_first;
+                       if (fa_match) {
+                               if (fa == fa_match)
+                                       err = 0;
                                goto out;
-
+                       }
                        err = -ENOBUFS;
                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
                        if (new_fa == NULL)
@@ -1231,7 +1255,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
                        new_fa->fa_type = cfg->fc_type;
                        new_fa->fa_scope = cfg->fc_scope;
                        state = fa->fa_state;
-                       new_fa->fa_state &= ~FA_S_ACCESSED;
+                       new_fa->fa_state = state & ~FA_S_ACCESSED;
 
                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
                        alias_free_mem_rcu(fa);
@@ -1248,20 +1272,11 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
                 * uses the same scope, type, and nexthop
                 * information.
                 */
-               fa_orig = fa;
-               list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
-                       if (fa->fa_tos != tos)
-                               break;
-                       if (fa->fa_info->fib_priority != fi->fib_priority)
-                               break;
-                       if (fa->fa_type == cfg->fc_type &&
-                           fa->fa_scope == cfg->fc_scope &&
-                           fa->fa_info == fi)
-                               goto out;
-               }
+               if (fa_match)
+                       goto out;
 
                if (!(cfg->fc_nlflags & NLM_F_APPEND))
-                       fa = fa_orig;
+                       fa = fa_first;
        }
        err = -ENOENT;
        if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1306,7 +1321,6 @@ err:
        return err;
 }
 
-
 /* should be called with rcu_read_lock */
 static int check_leaf(struct trie *t, struct leaf *l,
                      t_key key,  const struct flowi *flp,
@@ -1545,49 +1559,23 @@ found:
        return ret;
 }
 
-/* only called from updater side */
-static int trie_leaf_remove(struct trie *t, t_key key)
+/*
+ * Remove the leaf and return parent.
+ */
+static void trie_leaf_remove(struct trie *t, struct leaf *l)
 {
-       t_key cindex;
-       struct tnode *tp = NULL;
-       struct node *n = t->trie;
-       struct leaf *l;
+       struct tnode *tp = node_parent((struct node *) l);
 
-       pr_debug("entering trie_leaf_remove(%p)\n", n);
-
-       /* Note that in the case skipped bits, those bits are *not* checked!
-        * When we finish this, we will have NULL or a T_LEAF, and the
-        * T_LEAF may or may not match our key.
-        */
-
-       while (n != NULL && IS_TNODE(n)) {
-               struct tnode *tn = (struct tnode *) n;
-               check_tnode(tn);
-               n = tnode_get_child(tn, tkey_extract_bits(key,
-                                                         tn->pos, tn->bits));
-
-               BUG_ON(n && node_parent(n) != tn);
-       }
-       l = (struct leaf *) n;
-
-       if (!n || !tkey_equals(l->key, key))
-               return 0;
-
-       /*
-        * Key found.
-        * Remove the leaf and rebalance the tree
-        */
-       tp = node_parent(n);
-       tnode_free((struct tnode *) n);
+       pr_debug("entering trie_leaf_remove(%p)\n", l);
 
        if (tp) {
-               cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+               t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
                put_child(t, (struct tnode *)tp, cindex, NULL);
                rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
        } else
                rcu_assign_pointer(t->trie, NULL);
 
-       return 1;
+       tnode_free((struct tnode *) l);
 }
 
 /*
@@ -1628,9 +1616,8 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
 
        fa_to_delete = NULL;
-       fa_head = fa->fa_list.prev;
-
-       list_for_each_entry(fa, fa_head, fa_list) {
+       fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+       list_for_each_entry_continue(fa, fa_head, fa_list) {
                struct fib_info *fi = fa->fa_info;
 
                if (fa->fa_tos != tos)
@@ -1665,7 +1652,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
        }
 
        if (hlist_empty(&l->list))
-               trie_leaf_remove(t, key);
+               trie_leaf_remove(t, l);
 
        if (fa->fa_state & FA_S_ACCESSED)
                rt_cache_flush(-1);
@@ -1711,85 +1698,98 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
        return found;
 }
 
-/* rcu_read_lock needs to be hold by caller from readside */
-
-static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
+/*
+ * Scan for the next right leaf starting at node p->child[idx]
+ * Since we have back pointer, no recursion necessary.
+ */
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
 {
-       struct node *c = (struct node *) thisleaf;
-       struct tnode *p;
-       int idx;
-       struct node *trie = rcu_dereference(t->trie);
-
-       if (c == NULL) {
-               if (trie == NULL)
-                       return NULL;
-
-               if (IS_LEAF(trie))          /* trie w. just a leaf */
-                       return (struct leaf *) trie;
+       do {
+               t_key idx;
 
-               p = (struct tnode *)trie;  /* Start */
-       } else
-               p = node_parent_rcu(c);
-
-       while (p) {
-               int pos, last;
-
-               /*  Find the next child of the parent */
                if (c)
-                       pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
+                       idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
                else
-                       pos = 0;
-
-               last = 1 << p->bits;
-               for (idx = pos; idx < last ; idx++) {
-                       c = rcu_dereference(p->child[idx]);
+                       idx = 0;
 
+               while (idx < 1u << p->bits) {
+                       c = tnode_get_child_rcu(p, idx++);
                        if (!c)
                                continue;
 
-                       /* Decend if tnode */
-                       while (IS_TNODE(c)) {
-                               p = (struct tnode *) c;
-                               idx = 0;
-
-                               /* Rightmost non-NULL branch */
-                               if (p && IS_TNODE(p))
-                                       while (!(c = rcu_dereference(p->child[idx]))
-                                              && idx < (1<<p->bits)) idx++;
-
-                               /* Done with this tnode? */
-                               if (idx >= (1 << p->bits) || !c)
-                                       goto up;
+                       if (IS_LEAF(c)) {
+                               prefetch(p->child[idx]);
+                               return (struct leaf *) c;
                        }
-                       return (struct leaf *) c;
+
+                       /* Rescan start scanning in new node */
+                       p = (struct tnode *) c;
+                       idx = 0;
                }
-up:
-               /* No more children go up one step  */
+
+               /* Node empty, walk back up to parent */
                c = (struct node *) p;
-               p = node_parent_rcu(c);
+       } while ( (p = node_parent_rcu(c)) != NULL);
+
+       return NULL; /* Root of trie */
+}
+
+static struct leaf *trie_firstleaf(struct trie *t)
+{
+       struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
+
+       if (!n)
+               return NULL;
+
+       if (IS_LEAF(n))          /* trie is just a leaf */
+               return (struct leaf *) n;
+
+       return leaf_walk_rcu(n, NULL);
+}
+
+static struct leaf *trie_nextleaf(struct leaf *l)
+{
+       struct node *c = (struct node *) l;
+       struct tnode *p = node_parent(c);
+
+       if (!p)
+               return NULL;    /* trie with just one leaf */
+
+       return leaf_walk_rcu(p, c);
+}
+
+static struct leaf *trie_leafindex(struct trie *t, int index)
+{
+       struct leaf *l = trie_firstleaf(t);
+
+       while (index-- > 0) {
+               l = trie_nextleaf(l);
+               if (!l)
+                       break;
        }
-       return NULL; /* Ready. Root of trie */
+       return l;
 }
 
+
 /*
  * Caller must hold RTNL.
  */
 static int fn_trie_flush(struct fib_table *tb)
 {
        struct trie *t = (struct trie *) tb->tb_data;
-       struct leaf *ll = NULL, *l = NULL;
-       int found = 0, h;
+       struct leaf *l, *ll = NULL;
+       int found = 0;
 
-       for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+       for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
                found += trie_flush_leaf(t, l);
 
                if (ll && hlist_empty(&ll->list))
-                       trie_leaf_remove(t, ll->key);
+                       trie_leaf_remove(t, ll);
                ll = l;
        }
 
        if (ll && hlist_empty(&ll->list))
-               trie_leaf_remove(t, ll->key);
+               trie_leaf_remove(t, ll);
 
        pr_debug("trie_flush found=%d\n", found);
        return found;
@@ -1874,10 +1874,9 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
 {
        int i, s_i;
        struct fib_alias *fa;
-
        __be32 xkey = htonl(key);
 
-       s_i = cb->args[4];
+       s_i = cb->args[5];
        i = 0;
 
        /* rcu_read_lock is hold by caller */
@@ -1887,7 +1886,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
                        i++;
                        continue;
                }
-               BUG_ON(!fa->fa_info);
 
                if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
                                  cb->nlh->nlmsg_seq,
@@ -1898,76 +1896,90 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
                                  xkey,
                                  plen,
                                  fa->fa_tos,
-                                 fa->fa_info, 0) < 0) {
-                       cb->args[4] = i;
+                                 fa->fa_info, NLM_F_MULTI) < 0) {
+                       cb->args[5] = i;
                        return -1;
                }
                i++;
        }
-       cb->args[4] = i;
+       cb->args[5] = i;
        return skb->len;
 }
 
-static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb,
-                            struct sk_buff *skb, struct netlink_callback *cb)
+static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+                       struct sk_buff *skb, struct netlink_callback *cb)
 {
-       int h, s_h;
-       struct list_head *fa_head;
-       struct leaf *l = NULL;
+       struct leaf_info *li;
+       struct hlist_node *node;
+       int i, s_i;
 
-       s_h = cb->args[3];
+       s_i = cb->args[4];
+       i = 0;
 
-       for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
-               if (h < s_h)
+       /* rcu_read_lock is hold by caller */
+       hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+               if (i < s_i) {
+                       i++;
                        continue;
-               if (h > s_h)
-                       memset(&cb->args[4], 0,
-                              sizeof(cb->args) - 4*sizeof(cb->args[0]));
-
-               fa_head = get_fa_head(l, plen);
+               }
 
-               if (!fa_head)
-                       continue;
+               if (i > s_i)
+                       cb->args[5] = 0;
 
-               if (list_empty(fa_head))
+               if (list_empty(&li->falh))
                        continue;
 
-               if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb) < 0) {
-                       cb->args[3] = h;
+               if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
+                       cb->args[4] = i;
                        return -1;
                }
+               i++;
        }
-       cb->args[3] = h;
+
+       cb->args[4] = i;
        return skb->len;
 }
 
 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
                        struct netlink_callback *cb)
 {
-       int m, s_m;
+       struct leaf *l;
        struct trie *t = (struct trie *) tb->tb_data;
-
-       s_m = cb->args[2];
+       t_key key = cb->args[2];
+       int count = cb->args[3];
 
        rcu_read_lock();
-       for (m = 0; m <= 32; m++) {
-               if (m < s_m)
-                       continue;
-               if (m > s_m)
-                       memset(&cb->args[3], 0,
-                               sizeof(cb->args) - 3*sizeof(cb->args[0]));
+       /* Dump starting at last key.
+        * Note: 0.0.0.0/0 (ie default) is first key.
+        */
+       if (count == 0)
+               l = trie_firstleaf(t);
+       else {
+               /* Normally, continue from last key, but if that is missing
+                * fallback to using slow rescan
+                */
+               l = fib_find_node(t, key);
+               if (!l)
+                       l = trie_leafindex(t, count);
+       }
 
-               if (fn_trie_dump_plen(t, 32-m, tb, skb, cb) < 0) {
-                       cb->args[2] = m;
-                       goto out;
+       while (l) {
+               cb->args[2] = l->key;
+               if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+                       cb->args[3] = count;
+                       rcu_read_unlock();
+                       return -1;
                }
+
+               ++count;
+               l = trie_nextleaf(l);
+               memset(&cb->args[4], 0,
+                      sizeof(cb->args) - 4*sizeof(cb->args[0]));
        }
+       cb->args[3] = count;
        rcu_read_unlock();
-       cb->args[2] = m;
+
        return skb->len;
-out:
-       rcu_read_unlock();
-       return -1;
 }
 
 void __init fib_hash_init(void)
@@ -2399,31 +2411,29 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
 
        } else {
                struct leaf *l = (struct leaf *) n;
-               int i;
+               struct leaf_info *li;
+               struct hlist_node *node;
                __be32 val = htonl(l->key);
 
                seq_indent(seq, iter->depth);
                seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
-               for (i = 32; i >= 0; i--) {
-                       struct leaf_info *li = find_leaf_info(l, i);
-
-                       if (li) {
-                               struct fib_alias *fa;
-
-                               list_for_each_entry_rcu(fa, &li->falh, fa_list) {
-                                       char buf1[32], buf2[32];
-
-                                       seq_indent(seq, iter->depth+1);
-                                       seq_printf(seq, "  /%d %s %s", i,
-                                                  rtn_scope(buf1, sizeof(buf1),
-                                                            fa->fa_scope),
-                                                  rtn_type(buf2, sizeof(buf2),
-                                                            fa->fa_type));
-                                       if (fa->fa_tos)
-                                               seq_printf(seq, "tos =%d\n",
-                                                          fa->fa_tos);
-                                       seq_putc(seq, '\n');
-                               }
+
+               hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+                       struct fib_alias *fa;
+
+                       list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+                               char buf1[32], buf2[32];
+
+                               seq_indent(seq, iter->depth+1);
+                               seq_printf(seq, "  /%d %s %s", li->plen,
+                                          rtn_scope(buf1, sizeof(buf1),
+                                                    fa->fa_scope),
+                                          rtn_type(buf2, sizeof(buf2),
+                                                   fa->fa_type));
+                               if (fa->fa_tos)
+                                       seq_printf(seq, "tos =%d\n",
+                                                  fa->fa_tos);
+                               seq_putc(seq, '\n');
                        }
                }
        }
@@ -2477,8 +2487,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 {
        const struct fib_trie_iter *iter = seq->private;
        struct leaf *l = v;
-       int i;
-       char bf[128];
+       struct leaf_info *li;
+       struct hlist_node *node;
 
        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2493,20 +2503,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
        if (IS_TNODE(l))
                return 0;
 
-       for (i = 32; i >= 0; i--) {
-               struct leaf_info *li = find_leaf_info(l, i);
+       hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
                struct fib_alias *fa;
                __be32 mask, prefix;
 
-               if (!li)
-                       continue;
-
                mask = inet_make_mask(li->plen);
                prefix = htonl(l->key);
 
                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
                        const struct fib_info *fi = fa->fa_info;
                        unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+                       char bf[128];
 
                        if (fa->fa_type == RTN_BROADCAST
                            || fa->fa_type == RTN_MULTICAST)