mpc52xx: cleanup locking
[pandora-kernel.git] / lib / radix-tree.c
index e907858..5086bb9 100644 (file)
@@ -49,7 +49,7 @@ struct radix_tree_node {
        unsigned int    height;         /* Height from the bottom */
        unsigned int    count;
        struct rcu_head rcu_head;
-       void            *slots[RADIX_TREE_MAP_SIZE];
+       void __rcu      *slots[RADIX_TREE_MAP_SIZE];
        unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
@@ -82,6 +82,16 @@ struct radix_tree_preload {
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
+static inline void *ptr_to_indirect(void *ptr)
+{
+       return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
+}
+
+static inline void *indirect_to_ptr(void *ptr)
+{
+       return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
+}
+
 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
 {
        return root->gfp_mask & __GFP_BITS_MASK;
@@ -174,14 +184,16 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
 {
        struct radix_tree_node *node =
                        container_of(head, struct radix_tree_node, rcu_head);
+       int i;
 
        /*
         * must only free zeroed nodes into the slab. radix_tree_shrink
         * can leave us with a non-NULL entry in the first slot, so clear
         * that here to make sure.
         */
-       tag_clear(node, 0, 0);
-       tag_clear(node, 1, 0);
+       for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
+               tag_clear(node, i, 0);
+
        node->slots[0] = NULL;
        node->count = 0;
 
@@ -263,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
                        return -ENOMEM;
 
                /* Increase the height.  */
-               node->slots[0] = radix_tree_indirect_to_ptr(root->rnode);
+               node->slots[0] = indirect_to_ptr(root->rnode);
 
                /* Propagate the aggregated tag info into the new root */
                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -274,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
                newheight = root->height+1;
                node->height = newheight;
                node->count = 1;
-               node = radix_tree_ptr_to_indirect(node);
+               node = ptr_to_indirect(node);
                rcu_assign_pointer(root->rnode, node);
                root->height = newheight;
        } while (height > root->height);
@@ -307,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root,
                        return error;
        }
 
-       slot = radix_tree_indirect_to_ptr(root->rnode);
+       slot = indirect_to_ptr(root->rnode);
 
        height = root->height;
        shift = (height-1) * RADIX_TREE_MAP_SHIFT;
@@ -323,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root,
                                rcu_assign_pointer(node->slots[offset], slot);
                                node->count++;
                        } else
-                               rcu_assign_pointer(root->rnode,
-                                       radix_tree_ptr_to_indirect(slot));
+                               rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
                }
 
                /* Go a level down */
@@ -372,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
                        return NULL;
                return is_slot ? (void *)&root->rnode : node;
        }
-       node = radix_tree_indirect_to_ptr(node);
+       node = indirect_to_ptr(node);
 
        height = node->height;
        if (index > radix_tree_maxindex(height))
@@ -391,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
                height--;
        } while (height > 0);
 
-       return is_slot ? (void *)slot:node;
+       return is_slot ? (void *)slot : indirect_to_ptr(node);
 }
 
 /**
@@ -453,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
        height = root->height;
        BUG_ON(index > radix_tree_maxindex(height));
 
-       slot = radix_tree_indirect_to_ptr(root->rnode);
+       slot = indirect_to_ptr(root->rnode);
        shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 
        while (height > 0) {
@@ -507,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 
        shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
        pathp->node = NULL;
-       slot = radix_tree_indirect_to_ptr(root->rnode);
+       slot = indirect_to_ptr(root->rnode);
 
        while (height > 0) {
                int offset;
@@ -577,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 
        if (!radix_tree_is_indirect_ptr(node))
                return (index == 0);
-       node = radix_tree_indirect_to_ptr(node);
+       node = indirect_to_ptr(node);
 
        height = node->height;
        if (index > radix_tree_maxindex(height))
@@ -623,17 +634,30 @@ EXPORT_SYMBOL(radix_tree_tag_get);
  * also settag. The function stops either after tagging nr_to_tag items or
  * after reaching last_index.
  *
+ * The tags must be set from the leaf level only and propagated back up the
+ * path to the root. We must do this so that we resolve the full path before
+ * setting any tags on intermediate nodes. If we set tags as we descend, then
+ * we can get to the leaf node and find that the index that has the iftag
+ * set is outside the range we are scanning. This reults in dangling tags and
+ * can lead to problems with later tag operations (e.g. livelocks on lookups).
+ *
  * The function returns number of leaves where the tag was set and sets
  * *first_indexp to the first unscanned index.
+ * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
+ * be prepared to handle that.
  */
 unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
                unsigned long *first_indexp, unsigned long last_index,
                unsigned long nr_to_tag,
                unsigned int iftag, unsigned int settag)
 {
-       unsigned int height = root->height, shift;
-       unsigned long tagged = 0, index = *first_indexp;
-       struct radix_tree_node *open_slots[height], *slot;
+       unsigned int height = root->height;
+       struct radix_tree_path path[height];
+       struct radix_tree_path *pathp = path;
+       struct radix_tree_node *slot;
+       unsigned int shift;
+       unsigned long tagged = 0;
+       unsigned long index = *first_indexp;
 
        last_index = min(last_index, radix_tree_maxindex(height));
        if (index > last_index)
@@ -651,7 +675,14 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
        }
 
        shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-       slot = radix_tree_indirect_to_ptr(root->rnode);
+       slot = indirect_to_ptr(root->rnode);
+
+       /*
+        * we fill the path from (root->height - 2) to 0, leaving the index at
+        * (root->height - 1) as a terminator. Zero the node in the terminator
+        * so that we can use this to end walk loops back up the path.
+        */
+       path[height - 1].node = NULL;
 
        for (;;) {
                int offset;
@@ -661,21 +692,35 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
                        goto next;
                if (!tag_get(slot, iftag, offset))
                        goto next;
+               if (height > 1) {
+                       /* Go down one level */
+                       height--;
+                       shift -= RADIX_TREE_MAP_SHIFT;
+                       path[height - 1].node = slot;
+                       path[height - 1].offset = offset;
+                       slot = slot->slots[offset];
+                       continue;
+               }
+
+               /* tag the leaf */
+               tagged++;
                tag_set(slot, settag, offset);
-               if (height == 1) {
-                       tagged++;
-                       goto next;
+
+               /* walk back up the path tagging interior nodes */
+               pathp = &path[0];
+               while (pathp->node) {
+                       /* stop if we find a node with the tag already set */
+                       if (tag_get(pathp->node, settag, pathp->offset))
+                               break;
+                       tag_set(pathp->node, settag, pathp->offset);
+                       pathp++;
                }
-               /* Go down one level */
-               height--;
-               shift -= RADIX_TREE_MAP_SHIFT;
-               open_slots[height] = slot;
-               slot = slot->slots[offset];
-               continue;
+
 next:
                /* Go to next item at level determined by 'shift' */
                index = ((index >> shift) + 1) << shift;
-               if (index > last_index)
+               /* Overflow can happen when last_index is ~0UL... */
+               if (index > last_index || !index)
                        break;
                if (tagged >= nr_to_tag)
                        break;
@@ -685,7 +730,7 @@ next:
                         * last_index is guaranteed to be in the tree, what
                         * we do below cannot wander astray.
                         */
-                       slot = open_slots[height];
+                       slot = path[height - 1].node;
                        height++;
                        shift += RADIX_TREE_MAP_SHIFT;
                }
@@ -861,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
                results[0] = node;
                return 1;
        }
-       node = radix_tree_indirect_to_ptr(node);
+       node = indirect_to_ptr(node);
 
        max_index = radix_tree_maxindex(node->height);
 
@@ -880,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
                        slot = *(((void ***)results)[ret + i]);
                        if (!slot)
                                continue;
-                       results[ret + nr_found] = rcu_dereference_raw(slot);
+                       results[ret + nr_found] =
+                               indirect_to_ptr(rcu_dereference_raw(slot));
                        nr_found++;
                }
                ret += nr_found;
@@ -929,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
                results[0] = (void **)&root->rnode;
                return 1;
        }
-       node = radix_tree_indirect_to_ptr(node);
+       node = indirect_to_ptr(node);
 
        max_index = radix_tree_maxindex(node->height);
 
@@ -1054,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
                results[0] = node;
                return 1;
        }
-       node = radix_tree_indirect_to_ptr(node);
+       node = indirect_to_ptr(node);
 
        max_index = radix_tree_maxindex(node->height);
 
@@ -1073,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
                        slot = *(((void ***)results)[ret + i]);
                        if (!slot)
                                continue;
-                       results[ret + nr_found] = rcu_dereference_raw(slot);
+                       results[ret + nr_found] =
+                               indirect_to_ptr(rcu_dereference_raw(slot));
                        nr_found++;
                }
                ret += nr_found;
@@ -1123,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
                results[0] = (void **)&root->rnode;
                return 1;
        }
-       node = radix_tree_indirect_to_ptr(node);
+       node = indirect_to_ptr(node);
 
        max_index = radix_tree_maxindex(node->height);
 
@@ -1159,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
                void *newptr;
 
                BUG_ON(!radix_tree_is_indirect_ptr(to_free));
-               to_free = radix_tree_indirect_to_ptr(to_free);
+               to_free = indirect_to_ptr(to_free);
 
                /*
                 * The candidate node has more than one child, or its child
@@ -1172,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
 
                /*
                 * We don't need rcu_assign_pointer(), since we are simply
-                * moving the node from one part of the tree to another. If
-                * it was safe to dereference the old pointer to it
+                * moving the node from one part of the tree to another: if it
+                * was safe to dereference the old pointer to it
                 * (to_free->slots[0]), it will be safe to dereference the new
-                * one (root->rnode).
+                * one (root->rnode) as far as dependent read barriers go.
                 */
                newptr = to_free->slots[0];
                if (root->height > 1)
-                       newptr = radix_tree_ptr_to_indirect(newptr);
+                       newptr = ptr_to_indirect(newptr);
                root->rnode = newptr;
                root->height--;
+
+               /*
+                * We have a dilemma here. The node's slot[0] must not be
+                * NULLed in case there are concurrent lookups expecting to
+                * find the item. However if this was a bottom-level node,
+                * then it may be subject to the slot pointer being visible
+                * to callers dereferencing it. If item corresponding to
+                * slot[0] is subsequently deleted, these callers would expect
+                * their slot to become empty sooner or later.
+                *
+                * For example, lockless pagecache will look up a slot, deref
+                * the page pointer, and if the page is 0 refcount it means it
+                * was concurrently deleted from pagecache so try the deref
+                * again. Fortunately there is already a requirement for logic
+                * to retry the entire slot lookup -- the indirect pointer
+                * problem (replacing direct root node with an indirect pointer
+                * also results in a stale slot). So tag the slot as indirect
+                * to force callers to retry.
+                */
+               if (root->height == 0)
+                       *((unsigned long *)&to_free->slots[0]) |=
+                                               RADIX_TREE_INDIRECT_PTR;
+
                radix_tree_node_free(to_free);
        }
 }
@@ -1218,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
                root->rnode = NULL;
                goto out;
        }
-       slot = radix_tree_indirect_to_ptr(slot);
+       slot = indirect_to_ptr(slot);
 
        shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
        pathp->node = NULL;
@@ -1260,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
                        radix_tree_node_free(to_free);
 
                if (pathp->node->count) {
-                       if (pathp->node ==
-                                       radix_tree_indirect_to_ptr(root->rnode))
+                       if (pathp->node == indirect_to_ptr(root->rnode))
                                radix_tree_shrink(root);
                        goto out;
                }