Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 251bcdc..ba53128 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,8 +188,8 @@ _xfs_buf_initialize(
        atomic_set(&bp->b_hold, 1);
        init_completion(&bp->b_iowait);
        INIT_LIST_HEAD(&bp->b_list);
-       INIT_LIST_HEAD(&bp->b_hash_list);
-       init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
+       RB_CLEAR_NODE(&bp->b_rbnode);
+       sema_init(&bp->b_sema, 0); /* held, no waiters */
        XB_SET_OWNER(bp);
        bp->b_target = target;
        bp->b_file_offset = range_base;
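
The initializer swap above is behavior-preserving: init_MUTEX_LOCKED() was a legacy semaphore wrapper removed from the kernel around this time, and a counting semaphore initialized to 0 starts out held. RB_CLEAR_NODE() marks the rbtree node as unlinked so later RB_EMPTY_NODE() checks (see the xfs_buf_rele hunk below) can tell uncached buffers apart. A minimal sketch of the semaphore equivalence, with illustrative names that are not part of the patch:

    #include <linux/semaphore.h>

    static void example_init_held(struct semaphore *sem)
    {
            /* count of 0: starts "held"; the first down() would block */
            sema_init(sem, 0);
    }
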
@@ -262,8 +262,6 @@ xfs_buf_free(
 {
        trace_xfs_buf_free(bp, _RET_IP_);
 
-       ASSERT(list_empty(&bp->b_hash_list));
-
        if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
                uint            i;
 
@@ -422,8 +420,10 @@ _xfs_buf_find(
 {
        xfs_off_t               range_base;
        size_t                  range_length;
-       xfs_bufhash_t           *hash;
-       xfs_buf_t               *bp, *n;
+       struct xfs_perag        *pag;
+       struct rb_node          **rbp;
+       struct rb_node          *parent;
+       xfs_buf_t               *bp;
 
        range_base = (ioff << BBSHIFT);
        range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
        ASSERT(!(range_length < (1 << btp->bt_sshift)));
        ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-       hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-       spin_lock(&hash->bh_lock);
-
-       list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-               ASSERT(btp == bp->b_target);
-               if (bp->b_file_offset == range_base &&
-                   bp->b_buffer_length == range_length) {
+       /* get tree root */
+       pag = xfs_perag_get(btp->bt_mount,
+                               xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+       /* walk tree */
+       spin_lock(&pag->pag_buf_lock);
+       rbp = &pag->pag_buf_tree.rb_node;
+       parent = NULL;
+       bp = NULL;
+       while (*rbp) {
+               parent = *rbp;
+               bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+               if (range_base < bp->b_file_offset)
+                       rbp = &(*rbp)->rb_left;
+               else if (range_base > bp->b_file_offset)
+                       rbp = &(*rbp)->rb_right;
+               else {
+                       /*
+                        * found a block offset match. If the range doesn't
+                        * match, the only way this is allowed is if the buffer
+                        * in the cache is stale and the transaction that made
+                        * it stale has not yet committed. i.e. we are
+                        * reallocating a busy extent. Skip this buffer and
+                        * continue searching to the right for an exact match.
+                        */
+                       if (bp->b_buffer_length != range_length) {
+                               ASSERT(bp->b_flags & XBF_STALE);
+                               rbp = &(*rbp)->rb_right;
+                               continue;
+                       }
                        atomic_inc(&bp->b_hold);
                        goto found;
                }
@@ -449,17 +472,21 @@ _xfs_buf_find(
        if (new_bp) {
                _xfs_buf_initialize(new_bp, btp, range_base,
                                range_length, flags);
-               new_bp->b_hash = hash;
-               list_add(&new_bp->b_hash_list, &hash->bh_list);
+               rb_link_node(&new_bp->b_rbnode, parent, rbp);
+               rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+               /* the buffer keeps the perag reference until it is freed */
+               new_bp->b_pag = pag;
+               spin_unlock(&pag->pag_buf_lock);
        } else {
                XFS_STATS_INC(xb_miss_locked);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
        }
-
-       spin_unlock(&hash->bh_lock);
        return new_bp;
 
 found:
-       spin_unlock(&hash->bh_lock);
+       spin_unlock(&pag->pag_buf_lock);
+       xfs_perag_put(pag);
 
        /* Attempt to get the semaphore without sleeping,
         * if this does not work then we need to drop the
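
The lookup and insertion above follow the kernel's standard rbtree idiom (see Documentation/rbtree.txt): a single walk either finds the matching node or leaves rbp pointing at the link where a new node belongs, so a cache miss can insert without a second traversal. A condensed sketch of the pattern, with assumed illustrative types:

    #include <linux/rbtree.h>

    struct foo {
            struct rb_node  node;
            unsigned long   key;
    };

    static struct foo *foo_find_or_insert(struct rb_root *root, struct foo *new)
    {
            struct rb_node  **link = &root->rb_node;
            struct rb_node  *parent = NULL;

            while (*link) {
                    struct foo *entry;

                    parent = *link;
                    entry = rb_entry(parent, struct foo, node);
                    if (new->key < entry->key)
                            link = &(*link)->rb_left;
                    else if (new->key > entry->key)
                            link = &(*link)->rb_right;
                    else
                            return entry;                   /* hit */
            }
            rb_link_node(&new->node, parent, link);         /* miss: link in */
            rb_insert_color(&new->node, root);              /* rebalance */
            return new;
    }
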
@@ -625,8 +652,7 @@ void
 xfs_buf_readahead(
        xfs_buftarg_t           *target,
        xfs_off_t               ioff,
-       size_t                  isize,
-       xfs_buf_flags_t         flags)
+       size_t                  isize)
 {
        struct backing_dev_info *bdi;
 
@@ -634,8 +660,8 @@ xfs_buf_readahead(
        if (bdi_read_congested(bdi))
                return;
 
-       flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
-       xfs_buf_read(target, ioff, isize, flags);
+       xfs_buf_read(target, ioff, isize,
+                    XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
 }
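
With the flags argument gone, readahead policy (trylock, async, don't block) is fixed inside the helper rather than supplied by callers. A hypothetical call site after this change, with assumed names:

    xfs_buf_readahead(mp->m_ddev_targp, blkno, numblks);
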
 
 /*
@@ -664,7 +690,7 @@ xfs_buf_read_uncached(
        XFS_BUF_BUSY(bp);
 
        xfsbdstrat(mp, bp);
-       error = xfs_iowait(bp);
+       error = xfs_buf_iowait(bp);
        if (error || bp->b_error) {
                xfs_buf_relse(bp);
                return NULL;
@@ -809,27 +835,30 @@ void
 xfs_buf_rele(
        xfs_buf_t               *bp)
 {
-       xfs_bufhash_t           *hash = bp->b_hash;
+       struct xfs_perag        *pag = bp->b_pag;
 
        trace_xfs_buf_rele(bp, _RET_IP_);
 
-       if (unlikely(!hash)) {
+       if (!pag) {
                ASSERT(!bp->b_relse);
+               ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
                        xfs_buf_free(bp);
                return;
        }
 
+       ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
        ASSERT(atomic_read(&bp->b_hold) > 0);
-       if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
                if (bp->b_relse) {
                        atomic_inc(&bp->b_hold);
-                       spin_unlock(&hash->bh_lock);
-                       (*(bp->b_relse)) (bp);
+                       spin_unlock(&pag->pag_buf_lock);
+                       bp->b_relse(bp);
                } else {
                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-                       list_del_init(&bp->b_hash_list);
-                       spin_unlock(&hash->bh_lock);
+                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+                       spin_unlock(&pag->pag_buf_lock);
+                       xfs_perag_put(pag);
                        xfs_buf_free(bp);
                }
        }
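
The release path relies on atomic_dec_and_lock(): the per-AG lock is taken only when the hold count actually drops to zero, so the final rb_erase() and free are serialized against concurrent lookups that re-take a hold under pag_buf_lock. A stripped-down sketch of that pattern, with assumed types rather than the XFS code itself:

    if (atomic_dec_and_lock(&obj->ref, &tree_lock)) {
            /* last reference: unhash while still holding the lock */
            rb_erase(&obj->node, &tree_root);
            spin_unlock(&tree_lock);
            kfree(obj);
    }
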
@@ -957,19 +986,7 @@ xfs_buf_iodone_work(
        xfs_buf_t               *bp =
                container_of(work, xfs_buf_t, b_iodone_work);
 
-       /*
-        * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
-        * ordered flag and reissue them.  Because we can't tell the higher
-        * layers directly that they should not issue ordered I/O anymore, they
-        * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
-        */
-       if ((bp->b_error == EOPNOTSUPP) &&
-           (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
-               trace_xfs_buf_ordered_retry(bp, _RET_IP_);
-               bp->b_flags &= ~XBF_ORDERED;
-               bp->b_flags |= _XFS_BARRIER_FAILED;
-               xfs_buf_iorequest(bp);
-       } else if (bp->b_iodone)
+       if (bp->b_iodone)
                (*(bp->b_iodone))(bp);
        else if (bp->b_flags & XBF_ASYNC)
                xfs_buf_relse(bp);
@@ -1043,7 +1060,7 @@ xfs_bdwrite(
 
 /*
  * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
+ * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
  * so that the proper iodone callbacks get called.
  */
 STATIC int
@@ -1060,21 +1077,21 @@ xfs_bioerror(
        XFS_BUF_ERROR(bp, EIO);
 
        /*
-        * We're calling biodone, so delete XBF_DONE flag.
+        * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
         */
        XFS_BUF_UNREAD(bp);
        XFS_BUF_UNDELAYWRITE(bp);
        XFS_BUF_UNDONE(bp);
        XFS_BUF_STALE(bp);
 
-       xfs_biodone(bp);
+       xfs_buf_ioend(bp, 0);
 
        return EIO;
 }
 
 /*
  * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
+ * here ourselves, and avoiding the xfs_buf_ioend call.
  * This is meant for userdata errors; metadata bufs come with
  * iodone functions attached, so that we can track down errors.
  */
@@ -1225,7 +1242,7 @@ _xfs_buf_ioapply(
 
        if (bp->b_flags & XBF_ORDERED) {
                ASSERT(!(bp->b_flags & XBF_READ));
-               rw = WRITE_BARRIER;
+               rw = WRITE_FLUSH_FUA;
        } else if (bp->b_flags & XBF_LOG_BUFFER) {
                ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
                bp->b_flags &= ~_XBF_RUN_QUEUES;
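
WRITE_BARRIER disappears here because the 2.6.37-era block layer replaced hard barriers with explicit flush/FUA semantics; REQ_FLUSH and REQ_FUA requests are never rejected with EOPNOTSUPP (the block layer emulates them on devices without a volatile write cache), which is also why the ordered-retry branch in xfs_buf_iodone_work above could be deleted. At the time, the macro expanded roughly as follows (a sketch from include/linux/fs.h of that era; the exact flag set may differ by version):

    #define WRITE_FLUSH_FUA     (WRITE | REQ_SYNC | REQ_NOIDLE | \
                                 REQ_FLUSH | REQ_FUA)
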
@@ -1429,56 +1446,24 @@ xfs_buf_iomove(
  */
 void
 xfs_wait_buftarg(
-       xfs_buftarg_t   *btp)
+       struct xfs_buftarg      *btp)
 {
-       xfs_bufhash_t   *hash;
-       uint            i;
+       struct xfs_perag        *pag;
+       uint                    i;
 
-       for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-               hash = &btp->bt_hash[i];
-               spin_lock(&hash->bh_lock);
-               while (!list_empty(&hash->bh_list)) {
-                       spin_unlock(&hash->bh_lock);
+       for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+               pag = xfs_perag_get(btp->bt_mount, i);
+               spin_lock(&pag->pag_buf_lock);
+               while (rb_first(&pag->pag_buf_tree)) {
+                       spin_unlock(&pag->pag_buf_lock);
                        delay(100);
-                       spin_lock(&hash->bh_lock);
+                       spin_lock(&pag->pag_buf_lock);
                }
-               spin_unlock(&hash->bh_lock);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
        }
 }
 
-/*
- *     Allocate buffer hash table for a given target.
- *     For devices containing metadata (i.e. not the log/realtime devices)
- *     we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-       xfs_buftarg_t           *btp,
-       int                     external)
-{
-       unsigned int            i;
-
-       if (external) {
-               btp->bt_hash = NULL;
-               return;
-       }
-       btp->bt_hashshift = 12; /* 4096 buckets */
-       btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-                                        sizeof(xfs_bufhash_t));
-       for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-               spin_lock_init(&btp->bt_hash[i].bh_lock);
-               INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
-       }
-}
-
-STATIC void
-xfs_free_bufhash(
-       xfs_buftarg_t           *btp)
-{
-       kmem_free_large(btp->bt_hash);
-       btp->bt_hash = NULL;
-}
-
 /*
  *     buftarg list for delwrite queue processing
  */
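
xfs_wait_buftarg() now polls one rbtree per allocation group instead of the old hash buckets. The xfs_perag_get()/xfs_perag_put() pair is a reference-counted accessor for the per-AG structure, matching the reference the buffer itself holds via b_pag. A minimal sketch of the walk pattern, illustrative rather than part of the patch:

    xfs_agnumber_t  agno;

    for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
            struct xfs_perag *pag = xfs_perag_get(mp, agno);

            /* ... inspect pag->pag_buf_tree under pag->pag_buf_lock ... */
            xfs_perag_put(pag);
    }
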
@@ -1511,7 +1496,6 @@ xfs_free_buftarg(
        xfs_flush_buftarg(btp, 1);
        if (mp->m_flags & XFS_MOUNT_BARRIER)
                xfs_blkdev_issue_flush(btp);
-       xfs_free_bufhash(btp);
        iput(btp->bt_mapping->host);
 
        /* Unregister the buftarg first so that we don't get a
@@ -1651,7 +1635,6 @@ xfs_alloc_buftarg(
                goto error;
        if (xfs_alloc_delwrite_queue(btp, fsname))
                goto error;
-       xfs_alloc_bufhash(btp, external);
        return btp;
 
 error:
@@ -1942,7 +1925,7 @@ xfs_flush_buftarg(
                        bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
                        list_del_init(&bp->b_list);
-                       xfs_iowait(bp);
+                       xfs_buf_iowait(bp);
                        xfs_buf_relse(bp);
                }
        }
@@ -1959,7 +1942,7 @@ xfs_buf_init(void)
                goto out;
 
        xfslogd_workqueue = alloc_workqueue("xfslogd",
-                                       WQ_RESCUER | WQ_HIGHPRI, 1);
+                                       WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
        if (!xfslogd_workqueue)
                goto out_free_buf_zone;
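
WQ_RESCUER was renamed WQ_MEM_RECLAIM when the flag became part of the public workqueue API; the semantics are unchanged: the workqueue gets a dedicated rescuer thread so queued work can make forward progress even when new workers cannot be spawned under memory pressure. A hedged usage sketch with an assumed name:

    struct workqueue_struct *wq;

    wq = alloc_workqueue("example", WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
    if (!wq)
            return -ENOMEM;
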