Merge branch 'for-2.6.36' of git://git.kernel.dk/linux-2.6-block
[pandora-kernel.git] / fs / fs-writeback.c
index b7c7586..2f76c4a 100644 (file)
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
+#include <linux/tracepoint.h>
 #include "internal.h"
 
-#define inode_to_bdi(inode)    ((inode)->i_mapping->backing_dev_info)
-
-/*
- * We don't actually have pdflush, but this one is exported though /proc...
- */
-int nr_pdflush_threads;
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -50,6 +44,21 @@ struct wb_writeback_work {
        struct completion *done;        /* set if the caller waits */
 };
 
+/*
+ * Include the creation of the trace points after defining the
+ * wb_writeback_work structure so that the definition remains local to this
+ * file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
+#define inode_to_bdi(inode)    ((inode)->i_mapping->backing_dev_info)
+
+/*
+ * We don't actually have pdflush, but this one is exported through /proc...
+ */
+int nr_pdflush_threads;
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -65,22 +74,21 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 static void bdi_queue_work(struct backing_dev_info *bdi,
                struct wb_writeback_work *work)
 {
-       spin_lock(&bdi->wb_lock);
-       list_add_tail(&work->list, &bdi->work_list);
-       spin_unlock(&bdi->wb_lock);
+       trace_writeback_queue(bdi, work);
 
-       /*
-        * If the default thread isn't there, make sure we add it. When
-        * it gets created and wakes up, we'll run this work.
-        */
-       if (unlikely(list_empty_careful(&bdi->wb_list)))
+       spin_lock_bh(&bdi->wb_lock);
+       list_add_tail(&work->list, &bdi->work_list);
+       if (bdi->wb.task) {
+               wake_up_process(bdi->wb.task);
+       } else {
+               /*
+                * The bdi thread isn't there, wake up the forker thread which
+                * will create and run it.
+                */
+               trace_writeback_nothread(bdi, work);
                wake_up_process(default_backing_dev_info.wb.task);
-       else {
-               struct bdi_writeback *wb = &bdi->wb;
-
-               if (wb->task)
-                       wake_up_process(wb->task);
        }
+       spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
@@ -95,8 +103,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
         */
        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work) {
-               if (bdi->wb.task)
+               if (bdi->wb.task) {
+                       trace_writeback_nowork(bdi);
                        wake_up_process(bdi->wb.task);
+               }
                return;
        }
 
@@ -643,10 +653,14 @@ static long wb_writeback(struct bdi_writeback *wb,
                wbc.more_io = 0;
                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                wbc.pages_skipped = 0;
+
+               trace_wbc_writeback_start(&wbc, wb->bdi);
                if (work->sb)
                        __writeback_inodes_sb(work->sb, wb, &wbc);
                else
                        writeback_inodes_wb(wb, &wbc);
+               trace_wbc_writeback_written(&wbc, wb->bdi);
+
                work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
                wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
@@ -674,6 +688,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                if (!list_empty(&wb->b_more_io))  {
                        inode = list_entry(wb->b_more_io.prev,
                                                struct inode, i_list);
+                       trace_wbc_writeback_wait(&wbc, wb->bdi);
                        inode_wait_for_writeback(inode);
                }
                spin_unlock(&inode_lock);
@@ -686,17 +701,17 @@ static long wb_writeback(struct bdi_writeback *wb,
  * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
 static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+get_next_work_item(struct backing_dev_info *bdi)
 {
        struct wb_writeback_work *work = NULL;
 
-       spin_lock(&bdi->wb_lock);
+       spin_lock_bh(&bdi->wb_lock);
        if (!list_empty(&bdi->work_list)) {
                work = list_entry(bdi->work_list.next,
                                  struct wb_writeback_work, list);
                list_del_init(&work->list);
        }
-       spin_unlock(&bdi->wb_lock);
+       spin_unlock_bh(&bdi->wb_lock);
        return work;
 }
 
@@ -744,7 +759,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
        struct wb_writeback_work *work;
        long wrote = 0;
 
-       while ((work = get_next_work_item(bdi, wb)) != NULL) {
+       while ((work = get_next_work_item(bdi)) != NULL) {
                /*
                 * Override sync mode, in case we must wait for completion
                 * because this thread is exiting now.
@@ -752,6 +767,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
                if (force_wait)
                        work->sync_mode = WB_SYNC_ALL;
 
+               trace_writeback_exec(bdi, work);
+
                wrote += wb_writeback(wb, work);
 
                /*
@@ -776,47 +793,66 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * wakes up periodically and does kupdated style flushing.
  */
-int bdi_writeback_task(struct bdi_writeback *wb)
+int bdi_writeback_thread(void *data)
 {
-       unsigned long last_active = jiffies;
-       unsigned long wait_jiffies = -1UL;
+       struct bdi_writeback *wb = data;
+       struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
 
+       current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+       set_freezable();
+       wb->last_active = jiffies;
+
+       /*
+        * Our parent may run at a different priority, just set us to normal
+        */
+       set_user_nice(current, 0);
+
+       trace_writeback_thread_start(bdi);
+
        while (!kthread_should_stop()) {
+               /*
+                * Remove own delayed wake-up timer, since we are already awake
+                * and we'll take care of the periodic write-back.
+                */
+               del_timer(&wb->wakeup_timer);
+
                pages_written = wb_do_writeback(wb, 0);
 
+               trace_writeback_pages_written(pages_written);
+
                if (pages_written)
-                       last_active = jiffies;
-               else if (wait_jiffies != -1UL) {
-                       unsigned long max_idle;
+                       wb->last_active = jiffies;
 
-                       /*
-                        * Longest period of inactivity that we tolerate. If we
-                        * see dirty data again later, the task will get
-                        * recreated automatically.
-                        */
-                       max_idle = max(5UL * 60 * HZ, wait_jiffies);
-                       if (time_after(jiffies, max_idle + last_active))
-                               break;
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
+                       __set_current_state(TASK_RUNNING);
+                       continue;
                }
 
-               if (dirty_writeback_interval) {
-                       wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-                       schedule_timeout_interruptible(wait_jiffies);
-               } else {
-                       set_current_state(TASK_INTERRUPTIBLE);
-                       if (list_empty_careful(&wb->bdi->work_list) &&
-                           !kthread_should_stop())
-                               schedule();
-                       __set_current_state(TASK_RUNNING);
+               if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+                       schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+               else {
+                       /*
+                        * We have nothing to do, so can go sleep without any
+                        * timeout and save power. Queued work, kthread_stop()
+                        * (both re-checked above) or dirtied data wake us up.
+                        */
+                       schedule();
                }
 
                try_to_freeze();
        }
 
+       /* Flush any work that raced with us exiting */
+       if (!list_empty(&bdi->work_list))
+               wb_do_writeback(wb, 1);
+
+       trace_writeback_thread_stop(bdi);
        return 0;
 }
 
+
 /*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
@@ -891,6 +927,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
        struct super_block *sb = inode->i_sb;
+       struct backing_dev_info *bdi = NULL;
+       bool wakeup_bdi = false;
 
        /*
         * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -944,22 +982,31 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
-                       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-                       struct backing_dev_info *bdi = wb->bdi;
-
-                       if (bdi_cap_writeback_dirty(bdi) &&
-                           !test_bit(BDI_registered, &bdi->state)) {
-                               WARN_ON(1);
-                               printk(KERN_ERR "bdi-%s not registered\n",
-                                                               bdi->name);
+                       bdi = inode_to_bdi(inode);
+
+                       if (bdi_cap_writeback_dirty(bdi)) {
+                               WARN(!test_bit(BDI_registered, &bdi->state),
+                                    "bdi-%s not registered\n", bdi->name);
+
+                               /*
+                                * If this is the first dirty inode for this
+                                * bdi, we have to wake up the corresponding
+                                * bdi thread to make sure background
+                                * write-back happens later.
+                                */
+                               if (!wb_has_dirty_io(&bdi->wb))
+                                       wakeup_bdi = true;
                        }
 
                        inode->dirtied_when = jiffies;
-                       list_move(&inode->i_list, &wb->b_dirty);
+                       list_move(&inode->i_list, &bdi->wb.b_dirty);
                }
        }
 out:
        spin_unlock(&inode_lock);
+
+       if (wakeup_bdi)
+               bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);