Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph...
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 28 Aug 2010 21:07:20 +0000 (14:07 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 28 Aug 2010 21:07:20 +0000 (14:07 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  ceph: fix get_ticket_handler() error handling
  ceph: don't BUG on ENOMEM during mds reconnect
  ceph: ceph_mdsc_build_path() returns an ERR_PTR
  ceph: Fix warnings
  ceph: ceph_get_inode() returns an ERR_PTR
  ceph: initialize fields on new dentry_infos
  ceph: maintain i_head_snapc when any caps are dirty, not just for data
  ceph: fix osd request lru adjustment when sending request
  ceph: don't improperly set dir complete when holding EXCL cap
  mm: exporting account_page_dirtied
  ceph: direct requests in snapped namespace based on nonsnap parent
  ceph: queue cap snap writeback for realm children on snap update
  ceph: include dirty xattrs state in snapped caps
  ceph: fix xattr cap writeback
  ceph: fix multiple mds session shutdown

mm/page-writeback.c

diff --combined mm/page-writeback.c
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/syscalls.h>
  #include <linux/buffer_head.h>
  #include <linux/pagevec.h>
 +#include <trace/events/writeback.h>
  
  /*
   * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@@ -253,6 -252,32 +253,6 @@@ static void bdi_writeout_fraction(struc
        }
  }
  
 -/*
 - * Clip the earned share of dirty pages to that which is actually available.
 - * This avoids exceeding the total dirty_limit when the floating averages
 - * fluctuate too quickly.
 - */
 -static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
 -              unsigned long dirty, unsigned long *pbdi_dirty)
 -{
 -      unsigned long avail_dirty;
 -
 -      avail_dirty = global_page_state(NR_FILE_DIRTY) +
 -               global_page_state(NR_WRITEBACK) +
 -               global_page_state(NR_UNSTABLE_NFS) +
 -               global_page_state(NR_WRITEBACK_TEMP);
 -
 -      if (avail_dirty < dirty)
 -              avail_dirty = dirty - avail_dirty;
 -      else
 -              avail_dirty = 0;
 -
 -      avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
 -              bdi_stat(bdi, BDI_WRITEBACK);
 -
 -      *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
 -}
 -
  static inline void task_dirties_fraction(struct task_struct *tsk,
                long *numerator, long *denominator)
  {
  }
  
  /*
 - * scale the dirty limit
 + * task_dirty_limit - scale down dirty throttling threshold for one task
   *
   * task specific dirty limit:
   *
   *   dirty -= (dirty/8) * p_{t}
 + *
 + * To protect light/slow dirtying tasks from heavier/fast ones, we start
 + * throttling individual tasks before reaching the bdi dirty limit.
 + * Relatively low thresholds will be allocated to heavy dirtiers. So when
 + * dirty pages grow large, heavy dirtiers will be throttled first, which will
 + * effectively curb the growth of dirty pages. Light dirtiers with high enough
 + * dirty threshold may never get throttled.
   */
 -static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
 +static unsigned long task_dirty_limit(struct task_struct *tsk,
 +                                     unsigned long bdi_dirty)
  {
        long numerator, denominator;
 -      unsigned long dirty = *pdirty;
 +      unsigned long dirty = bdi_dirty;
        u64 inv = dirty >> 3;
  
        task_dirties_fraction(tsk, &numerator, &denominator);
        do_div(inv, denominator);
  
        dirty -= inv;
 -      if (dirty < *pdirty/2)
 -              dirty = *pdirty/2;
  
 -      *pdirty = dirty;
 +      return max(dirty, bdi_dirty/2);
  }
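A quick worked example of the scaling above, using hypothetical numbers (and assuming the elided statement between task_dirties_fraction() and do_div() multiplies inv by numerator, as the p_{t} formula requires): with bdi_dirty = 800 pages, a heavy dirtier responsible for half of the recent dirtying (p_{t} = 1/2) is scaled down to 800 - (800/8)*(1/2) = 750 pages, while a light dirtier with p_{t} = 1/100 keeps 800 - 1 = 799 pages; the max() clamp ensures the result never drops below bdi_dirty/2 = 400 pages.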
  
  /*
@@@ -397,16 -416,9 +397,16 @@@ unsigned long determine_dirtyable_memor
        return x + 1;   /* Ensure that we never return 0 */
  }
  
 -void
 -get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
 -               unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
 +/*
 + * global_dirty_limits - background-writeback and dirty-throttling thresholds
 + *
 + * Calculate the dirty thresholds based on sysctl parameters
 + * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
 + * - vm.dirty_ratio             or  vm.dirty_bytes
 + * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
 + * real-time tasks.
 + */
 +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
  {
        unsigned long background;
        unsigned long dirty;
        }
        *pbackground = background;
        *pdirty = dirty;
 +}
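For illustration, with hypothetical numbers: on a machine with about 1,048,576 dirtyable pages (4 GiB of 4 KiB pages), the default vm.dirty_ratio = 20 and vm.dirty_background_ratio = 10 give *pdirty ≈ 209,715 pages (~800 MiB) and *pbackground ≈ 104,857 pages (~400 MiB); a PF_LESS_THROTTLE or real-time caller would see both figures lifted by a further 1/4.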
  
 -      if (bdi) {
 -              u64 bdi_dirty;
 -              long numerator, denominator;
 +/*
 + * bdi_dirty_limit - @bdi's share of dirty throttling threshold
 + *
 + * Allocate high/low dirty limits to fast/slow devices, in order to prevent
 + * - starving fast devices
 + * - piling up dirty pages (that will take long time to sync) on slow devices
 + *
 + * The bdi's share of dirty limit will be adapting to its throughput and
 + * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 + */
 +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
 +{
 +      u64 bdi_dirty;
 +      long numerator, denominator;
  
 -              /*
 -               * Calculate this BDI's share of the dirty ratio.
 -               */
 -              bdi_writeout_fraction(bdi, &numerator, &denominator);
 -
 -              bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
 -              bdi_dirty *= numerator;
 -              do_div(bdi_dirty, denominator);
 -              bdi_dirty += (dirty * bdi->min_ratio) / 100;
 -              if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
 -                      bdi_dirty = dirty * bdi->max_ratio / 100;
 -
 -              *pbdi_dirty = bdi_dirty;
 -              clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
 -              task_dirty_limit(current, pbdi_dirty);
 -      }
 +      /*
 +       * Calculate this BDI's share of the dirty ratio.
 +       */
 +      bdi_writeout_fraction(bdi, &numerator, &denominator);
 +
 +      bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
 +      bdi_dirty *= numerator;
 +      do_div(bdi_dirty, denominator);
 +
 +      bdi_dirty += (dirty * bdi->min_ratio) / 100;
 +      if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
 +              bdi_dirty = dirty * bdi->max_ratio / 100;
 +
 +      return bdi_dirty;
  }
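For illustration, with hypothetical numbers: given dirty = 200,000 pages, no ratios configured (bdi_min_ratio = 0, bdi->min_ratio = 0, bdi->max_ratio = 100), and a device that accounts for one quarter of the recent writeout (numerator/denominator = 1/4), the function returns 200,000 * (100/100) * (1/4) = 50,000 pages; a non-zero bdi->min_ratio would raise the floor of that share, and a smaller bdi->max_ratio would cap it.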
  
  /*
@@@ -488,7 -490,7 +488,7 @@@ static void balance_dirty_pages(struct 
        unsigned long bdi_thresh;
        unsigned long pages_written = 0;
        unsigned long pause = 1;
 -
 +      bool dirty_exceeded = false;
        struct backing_dev_info *bdi = mapping->backing_dev_info;
  
        for (;;) {
                        .range_cyclic   = 1,
                };
  
 -              get_dirty_limits(&background_thresh, &dirty_thresh,
 -                              &bdi_thresh, bdi);
 -
                nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                        global_page_state(NR_UNSTABLE_NFS);
                nr_writeback = global_page_state(NR_WRITEBACK);
  
 -              bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
 -              bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 -
 -              if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
 -                      break;
 +              global_dirty_limits(&background_thresh, &dirty_thresh);
  
                /*
                 * Throttle it only when the background writeback cannot
                                (background_thresh + dirty_thresh) / 2)
                        break;
  
 -              if (!bdi->dirty_exceeded)
 -                      bdi->dirty_exceeded = 1;
 -
 -              /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 -               * Unstable writes are a feature of certain networked
 -               * filesystems (i.e. NFS) in which data may have been
 -               * written to the server's write cache, but has not yet
 -               * been flushed to permanent storage.
 -               * Only move pages to writeback if this bdi is over its
 -               * threshold otherwise wait until the disk writes catch
 -               * up.
 -               */
 -              if (bdi_nr_reclaimable > bdi_thresh) {
 -                      writeback_inodes_wb(&bdi->wb, &wbc);
 -                      pages_written += write_chunk - wbc.nr_to_write;
 -                      get_dirty_limits(&background_thresh, &dirty_thresh,
 -                                     &bdi_thresh, bdi);
 -              }
 +              bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
 +              bdi_thresh = task_dirty_limit(current, bdi_thresh);
  
                /*
                 * In order to avoid the stacked BDI deadlock we need
                if (bdi_thresh < 2*bdi_stat_error(bdi)) {
                        bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
                        bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
 -              } else if (bdi_nr_reclaimable) {
 +              } else {
                        bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
                        bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
                }
  
 -              if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
 +              /*
 +               * The bdi thresh is somehow "soft" limit derived from the
 +               * global "hard" limit. The former helps to prevent heavy IO
 +               * bdi or process from holding back light ones; The latter is
 +               * the last resort safeguard.
 +               */
 +              dirty_exceeded =
 +                      (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
 +                      || (nr_reclaimable + nr_writeback >= dirty_thresh);
 +
 +              if (!dirty_exceeded)
                        break;
 -              if (pages_written >= write_chunk)
 -                      break;          /* We've done our duty */
  
 +              if (!bdi->dirty_exceeded)
 +                      bdi->dirty_exceeded = 1;
 +
 +              /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 +               * Unstable writes are a feature of certain networked
 +               * filesystems (i.e. NFS) in which data may have been
 +               * written to the server's write cache, but has not yet
 +               * been flushed to permanent storage.
 +               * Only move pages to writeback if this bdi is over its
 +               * threshold otherwise wait until the disk writes catch
 +               * up.
 +               */
 +              trace_wbc_balance_dirty_start(&wbc, bdi);
 +              if (bdi_nr_reclaimable > bdi_thresh) {
 +                      writeback_inodes_wb(&bdi->wb, &wbc);
 +                      pages_written += write_chunk - wbc.nr_to_write;
 +                      trace_wbc_balance_dirty_written(&wbc, bdi);
 +                      if (pages_written >= write_chunk)
 +                              break;          /* We've done our duty */
 +              }
 +              trace_wbc_balance_dirty_wait(&wbc, bdi);
                __set_current_state(TASK_INTERRUPTIBLE);
                io_schedule_timeout(pause);
  
                        pause = HZ / 10;
        }
  
 -      if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
 -                      bdi->dirty_exceeded)
 +      if (!dirty_exceeded && bdi->dirty_exceeded)
                bdi->dirty_exceeded = 0;
  
        if (writeback_in_progress(bdi))
         * background_thresh, to keep the amount of dirty memory low.
         */
        if ((laptop_mode && pages_written) ||
 -          (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
 -                             + global_page_state(NR_UNSTABLE_NFS))
 -                                        > background_thresh)))
 +          (!laptop_mode && (nr_reclaimable > background_thresh)))
                bdi_start_background_writeback(bdi);
  }
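Plugging in the hypothetical thresholds from the global_dirty_limits() example above (background ≈ 104,857 pages, dirty ≈ 209,715 pages), the early break in the loop fires once nr_reclaimable + nr_writeback falls below (104,857 + 209,715)/2 ≈ 157,286 pages, handing the remaining work to the background flusher; only while the per-bdi soft limit or the global hard limit is still exceeded does the loop keep issuing writeback and sleeping in io_schedule_timeout().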
  
@@@ -660,7 -659,7 +660,7 @@@ void throttle_vm_writeout(gfp_t gfp_mas
        unsigned long dirty_thresh;
  
          for ( ; ; ) {
 -              get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 +              global_dirty_limits(&background_thresh, &dirty_thresh);
  
                  /*
                   * Boost the allowable dirty threshold a bit for page
@@@ -805,42 -804,6 +805,42 @@@ void __init page_writeback_init(void
        prop_descriptor_init(&vm_dirties, shift);
  }
  
 +/**
 + * tag_pages_for_writeback - tag pages to be written by write_cache_pages
 + * @mapping: address space structure to write
 + * @start: starting page index
 + * @end: ending page index (inclusive)
 + *
 + * This function scans the page range from @start to @end (inclusive) and tags
 + * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
 + * that write_cache_pages (or whoever calls this function) will then use
 + * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
 + * used to avoid livelocking of writeback by a process steadily creating new
 + * dirty pages in the file (thus it is important for this function to be quick
 + * so that it can tag pages faster than a dirtying process can create them).
 + */
 +/*
 + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
 + */
 +void tag_pages_for_writeback(struct address_space *mapping,
 +                           pgoff_t start, pgoff_t end)
 +{
 +#define WRITEBACK_TAG_BATCH 4096
 +      unsigned long tagged;
 +
 +      do {
 +              spin_lock_irq(&mapping->tree_lock);
 +              tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
 +                              &start, end, WRITEBACK_TAG_BATCH,
 +                              PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
 +              spin_unlock_irq(&mapping->tree_lock);
 +              WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
 +              cond_resched();
 +              /* We check 'start' to handle wrapping when end == ~0UL */
 +      } while (tagged >= WRITEBACK_TAG_BATCH && start);
 +}
 +EXPORT_SYMBOL(tag_pages_for_writeback);
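Rough cost figures, for illustration: with WRITEBACK_TAG_BATCH = 4096, tagging a fully dirty 1 GiB file (262,144 pages of 4 KiB) takes 64 acquisitions of tree_lock, with cond_resched() between batches, so the tagging pass stays cheap relative to the writeback it sets up; the extra check on start in the loop condition handles index wrap-around when end == ~0UL.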
 +
  /**
   * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
   * @mapping: address space structure to write
   * the call was made get new I/O started against them.  If wbc->sync_mode is
   * WB_SYNC_ALL then we were called for data integrity and we must wait for
   * existing IO to complete.
 + *
 + * To avoid livelocks (when other process dirties new pages), we first tag
 + * pages which should be written back with TOWRITE tag and only then start
 + * writing them. For data-integrity sync we have to be careful so that we do
 + * not miss some pages (e.g., because some other process has cleared TOWRITE
 + * tag we set). The rule we follow is that TOWRITE tag can be cleared only
 + * by the process clearing the DIRTY tag (and submitting the page for IO).
   */
  int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
        pgoff_t done_index;
        int cycled;
        int range_whole = 0;
 +      int tag;
  
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                        range_whole = 1;
                cycled = 1; /* ignore range_cyclic tests */
 -
 -              /*
 -               * If this is a data integrity sync, cap the writeback to the
 -               * current end of file. Any extension to the file that occurs
 -               * after this is a new write and we don't need to write those
 -               * pages out to fulfil our data integrity requirements. If we
 -               * try to write them out, we can get stuck in this scan until
 -               * the concurrent writer stops adding dirty pages and extending
 -               * EOF.
 -               */
 -              if (wbc->sync_mode == WB_SYNC_ALL &&
 -                  wbc->range_end == LLONG_MAX) {
 -                      end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
 -              }
        }
 -
 +      if (wbc->sync_mode == WB_SYNC_ALL)
 +              tag = PAGECACHE_TAG_TOWRITE;
 +      else
 +              tag = PAGECACHE_TAG_DIRTY;
  retry:
 +      if (wbc->sync_mode == WB_SYNC_ALL)
 +              tag_pages_for_writeback(mapping, index, end);
        done_index = index;
        while (!done && (index <= end)) {
                int i;
  
 -              nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 -                            PAGECACHE_TAG_DIRTY,
 +              nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
@@@ -964,7 -929,6 +964,7 @@@ continue_unlock
                        if (!clear_page_dirty_for_io(page))
                                goto continue_unlock;
  
 +                      trace_wbc_writepage(wbc, mapping->backing_dev_info);
                        ret = (*writepage)(page, wbc, data);
                        if (unlikely(ret)) {
                                if (ret == AOP_WRITEPAGE_ACTIVATE) {
                                }
                        }
  
 -                      if (wbc->nr_to_write > 0) {
 -                              if (--wbc->nr_to_write == 0 &&
 -                                  wbc->sync_mode == WB_SYNC_NONE) {
 -                                      /*
 -                                       * We stop writing back only if we are
 -                                       * not doing integrity sync. In case of
 -                                       * integrity sync we have to keep going
 -                                       * because someone may be concurrently
 -                                       * dirtying pages, and we might have
 -                                       * synced a lot of newly appeared dirty
 -                                       * pages, but have not synced all of the
 -                                       * old dirty pages.
 -                                       */
 -                                      done = 1;
 -                                      break;
 -                              }
 +                      /*
 +                       * We stop writing back only if we are not doing
 +                       * integrity sync. In case of integrity sync we have to
 +                       * keep going until we have written all the pages
 +                       * we tagged for writeback prior to entering this loop.
 +                       */
 +                      if (--wbc->nr_to_write <= 0 &&
 +                          wbc->sync_mode == WB_SYNC_NONE) {
 +                              done = 1;
 +                              break;
                        }
                }
                pagevec_release(&pvec);
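To make the calling convention concrete, here is a minimal, hypothetical ->writepages implementation (myfs_writepages and myfs_writepage are made-up names; this only sketches the pattern of passing a writepage_t callback plus an opaque data pointer to write_cache_pages(), which then does the tagging, lookup and accounting shown above):

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

/*
 * Called once per tagged page; the page arrives locked and already
 * clear_page_dirty_for_io()'d.  A real implementation would
 * set_page_writeback(), submit the I/O and end_page_writeback() on
 * completion; this sketch only logs and unlocks.
 */
static int myfs_writepage(struct page *page, struct writeback_control *wbc,
			  void *data)
{
	struct inode *inode = data;

	pr_debug("myfs: would write page %lu of inode %lu\n",
		 page->index, inode->i_ino);
	unlock_page(page);
	return 0;
}

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	/* write_cache_pages() handles tagging, lookup and nr_to_write */
	return write_cache_pages(mapping, wbc, myfs_writepage, mapping->host);
}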
@@@ -1126,6 -1096,7 +1126,7 @@@ void account_page_dirtied(struct page *
                task_io_account_write(PAGE_CACHE_SIZE);
        }
  }
+ EXPORT_SYMBOL(account_page_dirtied);
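This export exists so that a filesystem implementing its own ->set_page_dirty (as ceph does, to attach snapshot context to dirty pages) can keep the VM's dirty-page statistics consistent. A minimal sketch of that pattern, modelled on __set_page_dirty_nobuffers() (myfs_set_page_dirty is a made-up name and the filesystem-private bookkeeping is elided):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

static int myfs_set_page_dirty(struct page *page)
{
	struct address_space *mapping = page->mapping;

	if (TestSetPageDirty(page))
		return 0;		/* already dirty, nothing to account */

	/* ...filesystem-private bookkeeping (e.g. snap context) here... */

	spin_lock_irq(&mapping->tree_lock);
	account_page_dirtied(page, mapping);	/* the newly exported helper */
	radix_tree_tag_set(&mapping->page_tree, page_index(page),
			   PAGECACHE_TAG_DIRTY);
	spin_unlock_irq(&mapping->tree_lock);

	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return 1;
}

A production implementation must also re-check page_mapping() under tree_lock to guard against a racing truncate, exactly as __set_page_dirty_nobuffers() does.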
  
  /*
   * For address_spaces which do not use buffers.  Just tag the page as dirty in
@@@ -1357,9 -1328,6 +1358,9 @@@ int test_set_page_writeback(struct pag
                        radix_tree_tag_clear(&mapping->page_tree,
                                                page_index(page),
                                                PAGECACHE_TAG_DIRTY);
 +              radix_tree_tag_clear(&mapping->page_tree,
 +                                   page_index(page),
 +                                   PAGECACHE_TAG_TOWRITE);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
        } else {
                ret = TestSetPageWriteback(page);