Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
[pandora-kernel.git] / fs/xfs/linux-2.6/xfs_buf.c
index 286e36e..ba53128 100644
@@ -188,8 +188,8 @@ _xfs_buf_initialize(
        atomic_set(&bp->b_hold, 1);
        init_completion(&bp->b_iowait);
        INIT_LIST_HEAD(&bp->b_list);
-       INIT_LIST_HEAD(&bp->b_hash_list);
-       init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
+       RB_CLEAR_NODE(&bp->b_rbnode);
+       sema_init(&bp->b_sema, 0); /* held, no waiters */
        XB_SET_OWNER(bp);
        bp->b_target = target;
        bp->b_file_offset = range_base;
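
The first hunk swaps the buffer's hash-list linkage for an rbtree node and drops the long-deprecated init_MUTEX_LOCKED() in favour of an explicit sema_init(&bp->b_sema, 0): a semaphore initialised to a count of zero starts out held, so the buffer still leaves _xfs_buf_initialize() locked. A minimal sketch of that equivalence, with hypothetical demo_* names; it is illustrative only and not part of the patch:

	#include <linux/semaphore.h>

	static struct semaphore demo_sema;

	static void demo_init(void)
	{
		/* count 0: starts held, matching the old init_MUTEX_LOCKED() */
		sema_init(&demo_sema, 0);

		/* down(&demo_sema) would block here until another context ups it */
		up(&demo_sema);		/* release the initial hold */
	}
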
@@ -262,8 +262,6 @@ xfs_buf_free(
 {
        trace_xfs_buf_free(bp, _RET_IP_);
 
-       ASSERT(list_empty(&bp->b_hash_list));
-
        if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
                uint            i;
 
@@ -422,8 +420,10 @@ _xfs_buf_find(
 {
        xfs_off_t               range_base;
        size_t                  range_length;
-       xfs_bufhash_t           *hash;
-       xfs_buf_t               *bp, *n;
+       struct xfs_perag        *pag;
+       struct rb_node          **rbp;
+       struct rb_node          *parent;
+       xfs_buf_t               *bp;
 
        range_base = (ioff << BBSHIFT);
        range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
        ASSERT(!(range_length < (1 << btp->bt_sshift)));
        ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-       hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-       spin_lock(&hash->bh_lock);
-
-       list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-               ASSERT(btp == bp->b_target);
-               if (bp->b_file_offset == range_base &&
-                   bp->b_buffer_length == range_length) {
+       /* get tree root */
+       pag = xfs_perag_get(btp->bt_mount,
+                               xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+       /* walk tree */
+       spin_lock(&pag->pag_buf_lock);
+       rbp = &pag->pag_buf_tree.rb_node;
+       parent = NULL;
+       bp = NULL;
+       while (*rbp) {
+               parent = *rbp;
+               bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+               if (range_base < bp->b_file_offset)
+                       rbp = &(*rbp)->rb_left;
+               else if (range_base > bp->b_file_offset)
+                       rbp = &(*rbp)->rb_right;
+               else {
+                       /*
+                        * found a block offset match. If the range doesn't
+                        * match, the only way this is allowed is if the buffer
+                        * in the cache is stale and the transaction that made
+                        * it stale has not yet committed. i.e. we are
+                        * reallocating a busy extent. Skip this buffer and
+                        * continue searching to the right for an exact match.
+                        */
+                       if (bp->b_buffer_length != range_length) {
+                               ASSERT(bp->b_flags & XBF_STALE);
+                               rbp = &(*rbp)->rb_right;
+                               continue;
+                       }
                        atomic_inc(&bp->b_hold);
                        goto found;
                }
@@ -449,17 +472,21 @@ _xfs_buf_find(
        if (new_bp) {
                _xfs_buf_initialize(new_bp, btp, range_base,
                                range_length, flags);
-               new_bp->b_hash = hash;
-               list_add(&new_bp->b_hash_list, &hash->bh_list);
+               rb_link_node(&new_bp->b_rbnode, parent, rbp);
+               rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+               /* the buffer keeps the perag reference until it is freed */
+               new_bp->b_pag = pag;
+               spin_unlock(&pag->pag_buf_lock);
        } else {
                XFS_STATS_INC(xb_miss_locked);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
        }
-
-       spin_unlock(&hash->bh_lock);
        return new_bp;
 
 found:
-       spin_unlock(&hash->bh_lock);
+       spin_unlock(&pag->pag_buf_lock);
+       xfs_perag_put(pag);
 
        /* Attempt to get the semaphore without sleeping,
         * if this does not work then we need to drop the
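
The lookup and insert hunks above follow the standard kernel rbtree idiom: walk down from the root, remembering the last node visited and the child link the walk took, then on a miss splice the new node in at exactly that link with rb_link_node() before rebalancing with rb_insert_color(). A self-contained sketch of the idiom keyed on a plain u64 offset, with hypothetical demo_* names:

	#include <linux/types.h>
	#include <linux/rbtree.h>

	struct demo_node {
		struct rb_node	rbnode;
		u64		offset;		/* search key, analogous to b_file_offset */
	};

	/* Insert @new into @root, or return the existing node with the same offset. */
	static struct demo_node *
	demo_tree_insert(struct rb_root *root, struct demo_node *new)
	{
		struct rb_node	**rbp = &root->rb_node;
		struct rb_node	*parent = NULL;

		while (*rbp) {
			struct demo_node	*cur;

			parent = *rbp;
			cur = rb_entry(parent, struct demo_node, rbnode);

			if (new->offset < cur->offset)
				rbp = &(*rbp)->rb_left;
			else if (new->offset > cur->offset)
				rbp = &(*rbp)->rb_right;
			else
				return cur;	/* already present */
		}

		rb_link_node(&new->rbnode, parent, rbp);	/* splice in at the miss point */
		rb_insert_color(&new->rbnode, root);		/* rebalance */
		return new;
	}

_xfs_buf_find() adds one wrinkle on top of this idiom: an equal offset with a mismatched length is asserted to be XBF_STALE and the walk simply continues to the right, as the comment in the hunk explains.
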
@@ -625,8 +652,7 @@ void
 xfs_buf_readahead(
        xfs_buftarg_t           *target,
        xfs_off_t               ioff,
-       size_t                  isize,
-       xfs_buf_flags_t         flags)
+       size_t                  isize)
 {
        struct backing_dev_info *bdi;
 
@@ -634,8 +660,42 @@ xfs_buf_readahead(
        if (bdi_read_congested(bdi))
                return;
 
-       flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
-       xfs_buf_read(target, ioff, isize, flags);
+       xfs_buf_read(target, ioff, isize,
+                    XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
+}
+
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+struct xfs_buf *
+xfs_buf_read_uncached(
+       struct xfs_mount        *mp,
+       struct xfs_buftarg      *target,
+       xfs_daddr_t             daddr,
+       size_t                  length,
+       int                     flags)
+{
+       xfs_buf_t               *bp;
+       int                     error;
+
+       bp = xfs_buf_get_uncached(target, length, flags);
+       if (!bp)
+               return NULL;
+
+       /* set up the buffer for a read IO */
+       xfs_buf_lock(bp);
+       XFS_BUF_SET_ADDR(bp, daddr);
+       XFS_BUF_READ(bp);
+       XFS_BUF_BUSY(bp);
+
+       xfsbdstrat(mp, bp);
+       error = xfs_buf_iowait(bp);
+       if (error || bp->b_error) {
+               xfs_buf_relse(bp);
+               return NULL;
+       }
+       return bp;
 }
 
 xfs_buf_t *
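
xfs_buf_read_uncached(), added above, hands back a locked, I/O-complete buffer that never enters the per-AG cache, or NULL on failure. A hedged sketch of how a caller might use it: the helper name is hypothetical, mp->m_ddev_targp and XFS_FSB_TO_B() come from the wider XFS code rather than this diff, and the length is in bytes, as implied by xfs_buf_get_uncached() page-aligning it:

	/* Hypothetical helper: read one filesystem block at @daddr without caching it. */
	static struct xfs_buf *
	demo_read_fsb_uncached(struct xfs_mount *mp, xfs_daddr_t daddr)
	{
		struct xfs_buf	*bp;

		bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, daddr,
					   XFS_FSB_TO_B(mp, 1), 0);
		if (!bp)
			return NULL;	/* allocation or I/O failure */

		/* caller now owns the buffer lock and the only reference */
		return bp;
	}
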
@@ -707,9 +767,10 @@ xfs_buf_associate_memory(
 }
 
 xfs_buf_t *
-xfs_buf_get_noaddr(
+xfs_buf_get_uncached(
+       struct xfs_buftarg      *target,
        size_t                  len,
-       xfs_buftarg_t           *target)
+       int                     flags)
 {
        unsigned long           page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
        int                     error, i;
@@ -725,7 +786,7 @@ xfs_buf_get_noaddr(
                goto fail_free_buf;
 
        for (i = 0; i < page_count; i++) {
-               bp->b_pages[i] = alloc_page(GFP_KERNEL);
+               bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
                if (!bp->b_pages[i])
                        goto fail_free_mem;
        }
@@ -740,7 +801,7 @@ xfs_buf_get_noaddr(
 
        xfs_buf_unlock(bp);
 
-       trace_xfs_buf_get_noaddr(bp, _RET_IP_);
+       trace_xfs_buf_get_uncached(bp, _RET_IP_);
        return bp;
 
  fail_free_mem:
@@ -774,29 +835,30 @@ void
 xfs_buf_rele(
        xfs_buf_t               *bp)
 {
-       xfs_bufhash_t           *hash = bp->b_hash;
+       struct xfs_perag        *pag = bp->b_pag;
 
        trace_xfs_buf_rele(bp, _RET_IP_);
 
-       if (unlikely(!hash)) {
+       if (!pag) {
                ASSERT(!bp->b_relse);
+               ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
                        xfs_buf_free(bp);
                return;
        }
 
+       ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
        ASSERT(atomic_read(&bp->b_hold) > 0);
-       if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
                if (bp->b_relse) {
                        atomic_inc(&bp->b_hold);
-                       spin_unlock(&hash->bh_lock);
-                       (*(bp->b_relse)) (bp);
-               } else if (bp->b_flags & XBF_FS_MANAGED) {
-                       spin_unlock(&hash->bh_lock);
+                       spin_unlock(&pag->pag_buf_lock);
+                       bp->b_relse(bp);
                } else {
                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-                       list_del_init(&bp->b_hash_list);
-                       spin_unlock(&hash->bh_lock);
+                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+                       spin_unlock(&pag->pag_buf_lock);
+                       xfs_perag_put(pag);
                        xfs_buf_free(bp);
                }
        }
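
The release path above leans on atomic_dec_and_lock(): it decrements b_hold and only takes pag_buf_lock when the count actually reaches zero, so the common non-final release never touches the per-AG lock, while the final release keeps the lock held until the buffer has been unhooked from the rbtree. A generic sketch of that pattern, with hypothetical demo_* names:

	#include <linux/atomic.h>
	#include <linux/spinlock.h>

	struct demo_obj {
		atomic_t	refcount;
		spinlock_t	*tree_lock;	/* protects the lookup structure */
	};

	static void demo_put(struct demo_obj *obj)
	{
		/*
		 * Fast path: the decrement does not hit zero and tree_lock is
		 * never taken.  Slow path: the count reached zero and we return
		 * with tree_lock held, so the object can be removed from the
		 * lookup structure before anyone else can find it again.
		 */
		if (atomic_dec_and_lock(&obj->refcount, obj->tree_lock)) {
			/* ... unhook obj from the tree ... */
			spin_unlock(obj->tree_lock);
			/* ... free obj ... */
		}
	}
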
@@ -859,7 +921,7 @@ xfs_buf_lock(
        trace_xfs_buf_lock(bp, _RET_IP_);
 
        if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
-               xfs_log_force(bp->b_mount, 0);
+               xfs_log_force(bp->b_target->bt_mount, 0);
        if (atomic_read(&bp->b_io_remaining))
                blk_run_address_space(bp->b_target->bt_mapping);
        down(&bp->b_sema);
@@ -924,19 +986,7 @@ xfs_buf_iodone_work(
        xfs_buf_t               *bp =
                container_of(work, xfs_buf_t, b_iodone_work);
 
-       /*
-        * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
-        * ordered flag and reissue them.  Because we can't tell the higher
-        * layers directly that they should not issue ordered I/O anymore, they
-        * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
-        */
-       if ((bp->b_error == EOPNOTSUPP) &&
-           (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
-               trace_xfs_buf_ordered_retry(bp, _RET_IP_);
-               bp->b_flags &= ~XBF_ORDERED;
-               bp->b_flags |= _XFS_BARRIER_FAILED;
-               xfs_buf_iorequest(bp);
-       } else if (bp->b_iodone)
+       if (bp->b_iodone)
                (*(bp->b_iodone))(bp);
        else if (bp->b_flags & XBF_ASYNC)
                xfs_buf_relse(bp);
@@ -982,7 +1032,6 @@ xfs_bwrite(
 {
        int                     error;
 
-       bp->b_mount = mp;
        bp->b_flags |= XBF_WRITE;
        bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
 
@@ -1003,8 +1052,6 @@ xfs_bdwrite(
 {
        trace_xfs_buf_bdwrite(bp, _RET_IP_);
 
-       bp->b_mount = mp;
-
        bp->b_flags &= ~XBF_READ;
        bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
 
@@ -1013,7 +1060,7 @@ xfs_bdwrite(
 
 /*
  * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
+ * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
  * so that the proper iodone callbacks get called.
  */
 STATIC int
@@ -1030,21 +1077,21 @@ xfs_bioerror(
        XFS_BUF_ERROR(bp, EIO);
 
        /*
-        * We're calling biodone, so delete XBF_DONE flag.
+        * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
         */
        XFS_BUF_UNREAD(bp);
        XFS_BUF_UNDELAYWRITE(bp);
        XFS_BUF_UNDONE(bp);
        XFS_BUF_STALE(bp);
 
-       xfs_biodone(bp);
+       xfs_buf_ioend(bp, 0);
 
        return EIO;
 }
 
 /*
  * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
+ * here ourselves, and avoiding the xfs_buf_ioend call.
  * This is meant for userdata errors; metadata bufs come with
  * iodone functions attached, so that we can track down errors.
  */
@@ -1093,7 +1140,7 @@ int
 xfs_bdstrat_cb(
        struct xfs_buf  *bp)
 {
-       if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+       if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
                trace_xfs_bdstrat_shut(bp, _RET_IP_);
                /*
                 * Metadata write that didn't get logged but
@@ -1195,7 +1242,7 @@ _xfs_buf_ioapply(
 
        if (bp->b_flags & XBF_ORDERED) {
                ASSERT(!(bp->b_flags & XBF_READ));
-               rw = WRITE_BARRIER;
+               rw = WRITE_FLUSH_FUA;
        } else if (bp->b_flags & XBF_LOG_BUFFER) {
                ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
                bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1399,62 +1446,24 @@ xfs_buf_iomove(
  */
 void
 xfs_wait_buftarg(
-       xfs_buftarg_t   *btp)
-{
-       xfs_buf_t       *bp, *n;
-       xfs_bufhash_t   *hash;
-       uint            i;
-
-       for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-               hash = &btp->bt_hash[i];
-again:
-               spin_lock(&hash->bh_lock);
-               list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-                       ASSERT(btp == bp->b_target);
-                       if (!(bp->b_flags & XBF_FS_MANAGED)) {
-                               spin_unlock(&hash->bh_lock);
-                               /*
-                                * Catch superblock reference count leaks
-                                * immediately
-                                */
-                               BUG_ON(bp->b_bn == 0);
-                               delay(100);
-                               goto again;
-                       }
-               }
-               spin_unlock(&hash->bh_lock);
-       }
-}
-
-/*
- *     Allocate buffer hash table for a given target.
- *     For devices containing metadata (i.e. not the log/realtime devices)
- *     we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-       xfs_buftarg_t           *btp,
-       int                     external)
+       struct xfs_buftarg      *btp)
 {
-       unsigned int            i;
+       struct xfs_perag        *pag;
+       uint                    i;
 
-       btp->bt_hashshift = external ? 3 : 12;  /* 8 or 4096 buckets */
-       btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-                                        sizeof(xfs_bufhash_t));
-       for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-               spin_lock_init(&btp->bt_hash[i].bh_lock);
-               INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
+       for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+               pag = xfs_perag_get(btp->bt_mount, i);
+               spin_lock(&pag->pag_buf_lock);
+               while (rb_first(&pag->pag_buf_tree)) {
+                       spin_unlock(&pag->pag_buf_lock);
+                       delay(100);
+                       spin_lock(&pag->pag_buf_lock);
+               }
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
        }
 }
 
-STATIC void
-xfs_free_bufhash(
-       xfs_buftarg_t           *btp)
-{
-       kmem_free_large(btp->bt_hash);
-       btp->bt_hash = NULL;
-}
-
 /*
  *     buftarg list for delwrite queue processing
  */
@@ -1487,7 +1496,6 @@ xfs_free_buftarg(
        xfs_flush_buftarg(btp, 1);
        if (mp->m_flags & XFS_MOUNT_BARRIER)
                xfs_blkdev_issue_flush(btp);
-       xfs_free_bufhash(btp);
        iput(btp->bt_mapping->host);
 
        /* Unregister the buftarg first so that we don't get a
@@ -1609,6 +1617,7 @@ out_error:
 
 xfs_buftarg_t *
 xfs_alloc_buftarg(
+       struct xfs_mount        *mp,
        struct block_device     *bdev,
        int                     external,
        const char              *fsname)
@@ -1617,6 +1626,7 @@ xfs_alloc_buftarg(
 
        btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
 
+       btp->bt_mount = mp;
        btp->bt_dev =  bdev->bd_dev;
        btp->bt_bdev = bdev;
        if (xfs_setsize_buftarg_early(btp, bdev))
@@ -1625,7 +1635,6 @@ xfs_alloc_buftarg(
                goto error;
        if (xfs_alloc_delwrite_queue(btp, fsname))
                goto error;
-       xfs_alloc_bufhash(btp, external);
        return btp;
 
 error:
@@ -1916,7 +1925,7 @@ xfs_flush_buftarg(
                        bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
                        list_del_init(&bp->b_list);
-                       xfs_iowait(bp);
+                       xfs_buf_iowait(bp);
                        xfs_buf_relse(bp);
                }
        }
@@ -1933,7 +1942,7 @@ xfs_buf_init(void)
                goto out;
 
        xfslogd_workqueue = alloc_workqueue("xfslogd",
-                                       WQ_RESCUER | WQ_HIGHPRI, 1);
+                                       WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
        if (!xfslogd_workqueue)
                goto out_free_buf_zone;