Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 251bcdc..ba53128 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,8 +188,8 @@ _xfs_buf_initialize(
        atomic_set(&bp->b_hold, 1);
        init_completion(&bp->b_iowait);
        INIT_LIST_HEAD(&bp->b_list);
-       INIT_LIST_HEAD(&bp->b_hash_list);
-       init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
+       RB_CLEAR_NODE(&bp->b_rbnode);
+       sema_init(&bp->b_sema, 0); /* held, no waiters */
        XB_SET_OWNER(bp);
        bp->b_target = target;
        bp->b_file_offset = range_base;
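
The initializer swap above is behavior-preserving: init_MUTEX_LOCKED() was a legacy semaphore wrapper removed from the kernel around this time, and a counting semaphore initialized to 0 starts out held. RB_CLEAR_NODE() marks the rbtree node as unlinked so later RB_EMPTY_NODE() checks (see the xfs_buf_rele hunk below) can tell uncached buffers apart. A minimal sketch of the semaphore equivalence, with illustrative names that are not part of the patch:

    #include <linux/semaphore.h>

    static void example_init_held(struct semaphore *sem)
    {
            /* count of 0: starts "held"; the first down() would block */
            sema_init(sem, 0);
    }
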
@@ -262,8 +262,6 @@ xfs_buf_free(
 {
        trace_xfs_buf_free(bp, _RET_IP_);
 
-       ASSERT(list_empty(&bp->b_hash_list));
-
        if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
                uint            i;
 
@@ -422,8 +420,10 @@ _xfs_buf_find(
 {
        xfs_off_t               range_base;
        size_t                  range_length;
-       xfs_bufhash_t           *hash;
-       xfs_buf_t               *bp, *n;
+       struct xfs_perag        *pag;
+       struct rb_node          **rbp;
+       struct rb_node          *parent;
+       xfs_buf_t               *bp;
 
        range_base = (ioff << BBSHIFT);
        range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
        ASSERT(!(range_length < (1 << btp->bt_sshift)));
        ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-       hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-       spin_lock(&hash->bh_lock);
-
-       list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-               ASSERT(btp == bp->b_target);
-               if (bp->b_file_offset == range_base &&
-                   bp->b_buffer_length == range_length) {
+       /* get tree root */
+       pag = xfs_perag_get(btp->bt_mount,
+                               xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+       /* walk tree */
+       spin_lock(&pag->pag_buf_lock);
+       rbp = &pag->pag_buf_tree.rb_node;
+       parent = NULL;
+       bp = NULL;
+       while (*rbp) {
+               parent = *rbp;
+               bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+               if (range_base < bp->b_file_offset)
+                       rbp = &(*rbp)->rb_left;
+               else if (range_base > bp->b_file_offset)
+                       rbp = &(*rbp)->rb_right;
+               else {
+                       /*
+                        * found a block offset match. If the range doesn't
+                        * match, the only way this is allowed is if the buffer
+                        * in the cache is stale and the transaction that made
+                        * it stale has not yet committed. i.e. we are
+                        * reallocating a busy extent. Skip this buffer and
+                        * continue searching to the right for an exact match.
+                        */
+                       if (bp->b_buffer_length != range_length) {
+                               ASSERT(bp->b_flags & XBF_STALE);
+                               rbp = &(*rbp)->rb_right;
+                               continue;
+                       }
                        atomic_inc(&bp->b_hold);
                        goto found;
                }
@@ -449,17 +472,21 @@ _xfs_buf_find(
        if (new_bp) {
                _xfs_buf_initialize(new_bp, btp, range_base,
                                range_length, flags);
-               new_bp->b_hash = hash;
-               list_add(&new_bp->b_hash_list, &hash->bh_list);
+               rb_link_node(&new_bp->b_rbnode, parent, rbp);
+               rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+               /* the buffer keeps the perag reference until it is freed */
+               new_bp->b_pag = pag;
+               spin_unlock(&pag->pag_buf_lock);
        } else {
                XFS_STATS_INC(xb_miss_locked);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
        }
-
-       spin_unlock(&hash->bh_lock);
        return new_bp;
 
 found:
-       spin_unlock(&hash->bh_lock);
+       spin_unlock(&pag->pag_buf_lock);
+       xfs_perag_put(pag);
 
        /* Attempt to get the semaphore without sleeping,
         * if this does not work then we need to drop the
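
The lookup and insertion above follow the kernel's standard rbtree idiom (see Documentation/rbtree.txt): a single walk either finds the matching node or leaves rbp pointing at the link where a new node belongs, so a cache miss can insert without a second traversal. A condensed sketch of the pattern, with assumed illustrative types:

    #include <linux/rbtree.h>

    struct foo {
            struct rb_node  node;
            unsigned long   key;
    };

    static struct foo *foo_find_or_insert(struct rb_root *root, struct foo *new)
    {
            struct rb_node  **link = &root->rb_node;
            struct rb_node  *parent = NULL;

            while (*link) {
                    struct foo *entry;

                    parent = *link;
                    entry = rb_entry(parent, struct foo, node);
                    if (new->key < entry->key)
                            link = &(*link)->rb_left;
                    else if (new->key > entry->key)
                            link = &(*link)->rb_right;
                    else
                            return entry;                   /* hit */
            }
            rb_link_node(&new->node, parent, link);         /* miss: link in */
            rb_insert_color(&new->node, root);              /* rebalance */
            return new;
    }
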
@@ -625,8 +652,7 @@ void
 xfs_buf_readahead(
        xfs_buftarg_t           *target,
        xfs_off_t               ioff,
-       size_t                  isize,
-       xfs_buf_flags_t         flags)
+       size_t                  isize)
 {
        struct backing_dev_info *bdi;
 
@@ -634,8 +660,8 @@ xfs_buf_readahead(
        if (bdi_read_congested(bdi))
                return;
 
-       flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
-       xfs_buf_read(target, ioff, isize, flags);
+       xfs_buf_read(target, ioff, isize,
+                    XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
 }
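
With the flags argument gone, readahead policy (trylock, async, don't block) is fixed inside the helper rather than supplied by callers. A hypothetical call site after this change, with assumed names:

    xfs_buf_readahead(mp->m_ddev_targp, blkno, numblks);
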
 
 /*
@@ -664,7 +690,7 @@ xfs_buf_read_uncached(
        XFS_BUF_BUSY(bp);
 
        xfsbdstrat(mp, bp);
-       error = xfs_iowait(bp);
+       error = xfs_buf_iowait(bp);
        if (error || bp->b_error) {
                xfs_buf_relse(bp);
                return NULL;
@@ -809,27 +835,30 @@ void
 xfs_buf_rele(
        xfs_buf_t               *bp)
 {
-       xfs_bufhash_t           *hash = bp->b_hash;
+       struct xfs_perag        *pag = bp->b_pag;
 
        trace_xfs_buf_rele(bp, _RET_IP_);
 
-       if (unlikely(!hash)) {
+       if (!pag) {
                ASSERT(!bp->b_relse);
+               ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
                        xfs_buf_free(bp);
                return;
        }
 
+       ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
        ASSERT(atomic_read(&bp->b_hold) > 0);
-       if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
                if (bp->b_relse) {
                        atomic_inc(&bp->b_hold);
-                       spin_unlock(&hash->bh_lock);
-                       (*(bp->b_relse)) (bp);
+                       spin_unlock(&pag->pag_buf_lock);
+                       bp->b_relse(bp);
                } else {
                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-                       list_del_init(&bp->b_hash_list);
-                       spin_unlock(&hash->bh_lock);
+                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+                       spin_unlock(&pag->pag_buf_lock);
+                       xfs_perag_put(pag);
                        xfs_buf_free(bp);
                }
        }
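
The release path relies on atomic_dec_and_lock(): the per-AG lock is taken only when the hold count actually drops to zero, so the final rb_erase() and free are serialized against concurrent lookups that re-take a hold under pag_buf_lock. A stripped-down sketch of that pattern, with assumed types rather than the XFS code itself:

    if (atomic_dec_and_lock(&obj->ref, &tree_lock)) {
            /* last reference: unhash while still holding the lock */
            rb_erase(&obj->node, &tree_root);
            spin_unlock(&tree_lock);
            kfree(obj);
    }
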
@@ -957,19 +986,7 @@ xfs_buf_iodone_work(
        xfs_buf_t               *bp =
                container_of(work, xfs_buf_t, b_iodone_work);
 
-       /*
-        * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
-        * ordered flag and reissue them.  Because we can't tell the higher
-        * layers directly that they should not issue ordered I/O anymore, they
-        * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
-        */
-       if ((bp->b_error == EOPNOTSUPP) &&
-           (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
-               trace_xfs_buf_ordered_retry(bp, _RET_IP_);
-               bp->b_flags &= ~XBF_ORDERED;
-               bp->b_flags |= _XFS_BARRIER_FAILED;
-               xfs_buf_iorequest(bp);
-       } else if (bp->b_iodone)
+       if (bp->b_iodone)
                (*(bp->b_iodone))(bp);
        else if (bp->b_flags & XBF_ASYNC)
                xfs_buf_relse(bp);
@@ -1043,7 +1060,7 @@ xfs_bdwrite(
 
 /*
  * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
+ * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
  * so that the proper iodone callbacks get called.
  */
 STATIC int
@@ -1060,21 +1077,21 @@ xfs_bioerror(
        XFS_BUF_ERROR(bp, EIO);
 
        /*
-        * We're calling biodone, so delete XBF_DONE flag.
+        * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
         */
        XFS_BUF_UNREAD(bp);
        XFS_BUF_UNDELAYWRITE(bp);
        XFS_BUF_UNDONE(bp);
        XFS_BUF_STALE(bp);
 
-       xfs_biodone(bp);
+       xfs_buf_ioend(bp, 0);
 
        return EIO;
 }
 
 /*
  * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
+ * here ourselves, and avoiding the xfs_buf_ioend call.
  * This is meant for userdata errors; metadata bufs come with
  * iodone functions attached, so that we can track down errors.
  */
@@ -1225,7 +1242,7 @@ _xfs_buf_ioapply(
 
        if (bp->b_flags & XBF_ORDERED) {
                ASSERT(!(bp->b_flags & XBF_READ));
-               rw = WRITE_BARRIER;
+               rw = WRITE_FLUSH_FUA;
        } else if (bp->b_flags & XBF_LOG_BUFFER) {
                ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
                bp->b_flags &= ~_XBF_RUN_QUEUES;
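
WRITE_BARRIER disappears here because the 2.6.37-era block layer replaced hard barriers with explicit flush/FUA semantics; REQ_FLUSH and REQ_FUA requests are never rejected with EOPNOTSUPP (the block layer emulates them on devices without a volatile write cache), which is also why the ordered-retry branch in xfs_buf_iodone_work above could be deleted. At the time, the macro expanded roughly as follows (a sketch from include/linux/fs.h of that era; the exact flag set may differ by version):

    #define WRITE_FLUSH_FUA     (WRITE | REQ_SYNC | REQ_NOIDLE | \
                                 REQ_FLUSH | REQ_FUA)
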
@@ -1429,56 +1446,24 @@ xfs_buf_iomove(
  */
 void
 xfs_wait_buftarg(
-       xfs_buftarg_t   *btp)
+       struct xfs_buftarg      *btp)
 {
-       xfs_bufhash_t   *hash;
-       uint            i;
+       struct xfs_perag        *pag;
+       uint                    i;
 
-       for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-               hash = &btp->bt_hash[i];
-               spin_lock(&hash->bh_lock);
-               while (!list_empty(&hash->bh_list)) {
-                       spin_unlock(&hash->bh_lock);
+       for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+               pag = xfs_perag_get(btp->bt_mount, i);
+               spin_lock(&pag->pag_buf_lock);
+               while (rb_first(&pag->pag_buf_tree)) {
+                       spin_unlock(&pag->pag_buf_lock);
                        delay(100);
-                       spin_lock(&hash->bh_lock);
+                       spin_lock(&pag->pag_buf_lock);
                }
-               spin_unlock(&hash->bh_lock);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
        }
 }
 
-/*
- *     Allocate buffer hash table for a given target.
- *     For devices containing metadata (i.e. not the log/realtime devices)
- *     we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-       xfs_buftarg_t           *btp,
-       int                     external)
-{
-       unsigned int            i;
-
-       if (external) {
-               btp->bt_hash = NULL;
-               return;
-       }
-       btp->bt_hashshift = 12; /* 4096 buckets */
-       btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-                                        sizeof(xfs_bufhash_t));
-       for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-               spin_lock_init(&btp->bt_hash[i].bh_lock);
-               INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
-       }
-}
-
-STATIC void
-xfs_free_bufhash(
-       xfs_buftarg_t           *btp)
-{
-       kmem_free_large(btp->bt_hash);
-       btp->bt_hash = NULL;
-}
-
 /*
  *     buftarg list for delwrite queue processing
  */
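
xfs_wait_buftarg() now polls one rbtree per allocation group instead of the old hash buckets. The xfs_perag_get()/xfs_perag_put() pair is a reference-counted accessor for the per-AG structure, matching the reference the buffer itself holds via b_pag. A minimal sketch of the walk pattern, illustrative rather than part of the patch:

    xfs_agnumber_t  agno;

    for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
            struct xfs_perag *pag = xfs_perag_get(mp, agno);

            /* ... inspect pag->pag_buf_tree under pag->pag_buf_lock ... */
            xfs_perag_put(pag);
    }
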
@@ -1511,7 +1496,6 @@ xfs_free_buftarg(
        xfs_flush_buftarg(btp, 1);
        if (mp->m_flags & XFS_MOUNT_BARRIER)
                xfs_blkdev_issue_flush(btp);
-       xfs_free_bufhash(btp);
        iput(btp->bt_mapping->host);
 
        /* Unregister the buftarg first so that we don't get a
@@ -1651,7 +1635,6 @@ xfs_alloc_buftarg(
                goto error;
        if (xfs_alloc_delwrite_queue(btp, fsname))
                goto error;
-       xfs_alloc_bufhash(btp, external);
        return btp;
 
 error:
@@ -1942,7 +1925,7 @@ xfs_flush_buftarg(
                        bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
                        list_del_init(&bp->b_list);
-                       xfs_iowait(bp);
+                       xfs_buf_iowait(bp);
                        xfs_buf_relse(bp);
                }
        }
@@ -1959,7 +1942,7 @@ xfs_buf_init(void)
                goto out;
 
        xfslogd_workqueue = alloc_workqueue("xfslogd",
-                                       WQ_RESCUER | WQ_HIGHPRI, 1);
+                                       WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
        if (!xfslogd_workqueue)
                goto out_free_buf_zone;
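
WQ_RESCUER was renamed WQ_MEM_RECLAIM when the flag became part of the public workqueue API; the semantics are unchanged: the workqueue gets a dedicated rescuer thread so queued work can make forward progress even when new workers cannot be spawned under memory pressure. A hedged usage sketch with an assumed name:

    struct workqueue_struct *wq;

    wq = alloc_workqueue("example", WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
    if (!wq)
            return -ENOMEM;
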