Btrfs: fix deadlock on async thread startup
authorChris Mason <chris.mason@oracle.com>
Fri, 2 Oct 2009 23:11:56 +0000 (19:11 -0400)
committerChris Mason <chris.mason@oracle.com>
Mon, 5 Oct 2009 13:44:45 +0000 (09:44 -0400)
The btrfs async worker threads are used for a wide variety of things,
including processing bio end_io functions.  This means that when
the endio threads aren't running, the rest of the FS isn't
able to do the final processing required to clear PageWriteback.

The endio threads also try to exit as they become idle and
start more as the work piles up.  The problem is that starting more
threads means kthreadd may need to allocate ram, and that allocation
may wait until the global number of writeback pages on the system is
below a certain limit.

The result of that throttling is that end IO threads wait on
kthreadd, who is waiting on IO to end, which will never happen.

This commit fixes the deadlock by handing off thread startup to a
dedicated thread.  It also fixes a bug where the on-demand thread
creation was creating far too many threads because it didn't take into
account threads being started by other procs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/relocation.c

index 282ca08..c0861e7 100644 (file)
@@ -63,6 +63,51 @@ struct btrfs_worker_thread {
        int idle;
 };
 
+/*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time.  It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue who never has more than one thread
+ * where we scheduler thread start operations.  This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+       struct btrfs_work work;
+       struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+       struct worker_start *start;
+       start = container_of(work, struct worker_start, work);
+       btrfs_start_workers(start->queue, 1);
+       kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+       struct worker_start *start;
+       int ret;
+
+       start = kzalloc(sizeof(*start), GFP_NOFS);
+       if (!start)
+               return -ENOMEM;
+
+       start->work.func = start_new_worker_func;
+       start->queue = queue;
+       ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+       if (ret)
+               kfree(start);
+       return ret;
+}
+
 /*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
@@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
                goto out;
 
        workers->atomic_start_pending = 0;
-       if (workers->num_workers >= workers->max_workers)
+       if (workers->num_workers + workers->num_workers_starting >=
+           workers->max_workers)
                goto out;
 
+       workers->num_workers_starting += 1;
        spin_unlock_irqrestore(&workers->lock, flags);
-       btrfs_start_workers(workers, 1);
+       start_new_worker(workers);
        return;
 
 out:
@@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 /*
  * simple init on struct btrfs_workers
  */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+                       struct btrfs_workers *async_helper)
 {
        workers->num_workers = 0;
+       workers->num_workers_starting = 0;
        INIT_LIST_HEAD(&workers->worker_list);
        INIT_LIST_HEAD(&workers->idle_list);
        INIT_LIST_HEAD(&workers->order_list);
@@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
        workers->name = name;
        workers->ordered = 0;
        workers->atomic_start_pending = 0;
-       workers->atomic_worker_start = 0;
+       workers->atomic_worker_start = async_helper;
 }
 
 /*
  * starts new worker threads.  This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+                                int num_workers)
 {
        struct btrfs_worker_thread *worker;
        int ret = 0;
@@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                list_add_tail(&worker->worker_list, &workers->idle_list);
                worker->idle = 1;
                workers->num_workers++;
+               workers->num_workers_starting--;
+               WARN_ON(workers->num_workers_starting < 0);
                spin_unlock_irq(&workers->lock);
        }
        return 0;
@@ -452,6 +504,14 @@ fail:
        return ret;
 }
 
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+       spin_lock_irq(&workers->lock);
+       workers->num_workers_starting += num_workers;
+       spin_unlock_irq(&workers->lock);
+       return __btrfs_start_workers(workers, num_workers);
+}
+
 /*
  * run through the list and find a worker thread that doesn't have a lot
  * to do right now.  This can return null if we aren't yet at the thread
@@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 {
        struct btrfs_worker_thread *worker;
        struct list_head *next;
-       int enforce_min = workers->num_workers < workers->max_workers;
+       int enforce_min;
+
+       enforce_min = (workers->num_workers + workers->num_workers_starting) <
+               workers->max_workers;
 
        /*
         * if we find an idle thread, don't move it to the end of the
@@ -509,15 +572,17 @@ again:
        worker = next_worker(workers);
 
        if (!worker) {
-               if (workers->num_workers >= workers->max_workers) {
+               if (workers->num_workers + workers->num_workers_starting >=
+                   workers->max_workers) {
                        goto fallback;
                } else if (workers->atomic_worker_start) {
                        workers->atomic_start_pending = 1;
                        goto fallback;
                } else {
+                       workers->num_workers_starting++;
                        spin_unlock_irqrestore(&workers->lock, flags);
                        /* we're below the limit, start another worker */
-                       btrfs_start_workers(workers, 1);
+                       __btrfs_start_workers(workers, 1);
                        goto again;
                }
        }
index fc089b9..5077746 100644 (file)
@@ -64,6 +64,8 @@ struct btrfs_workers {
        /* current number of running workers */
        int num_workers;
 
+       int num_workers_starting;
+
        /* max number of workers allowed.  changed by btrfs_start_workers */
        int max_workers;
 
@@ -78,9 +80,10 @@ struct btrfs_workers {
 
        /*
         * are we allowed to sleep while starting workers or are we required
-        * to start them at a later time?
+        * to start them at a later time?  If we can't sleep, this indicates
+        * which queue we need to use to schedule thread creation.
         */
-       int atomic_worker_start;
+       struct btrfs_workers *atomic_worker_start;
 
        /* list with all the work threads.  The workers on the idle thread
         * may be actively servicing jobs, but they haven't yet hit the
@@ -109,7 +112,8 @@ struct btrfs_workers {
 int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
 int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+                       struct btrfs_workers *async_starter);
 int btrfs_requeue_work(struct btrfs_work *work);
 void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
index 8184f2f..1b920ff 100644 (file)
@@ -907,6 +907,7 @@ struct btrfs_fs_info {
         * A third pool does submit_bio to avoid deadlocking with the other
         * two
         */
+       struct btrfs_workers generic_worker;
        struct btrfs_workers workers;
        struct btrfs_workers delalloc_workers;
        struct btrfs_workers endio_workers;
index 69dce50..9903f04 100644 (file)
@@ -1746,21 +1746,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                err = -EINVAL;
                goto fail_iput;
        }
-printk("thread pool is %d\n", fs_info->thread_pool_size);
-       /*
-        * we need to start all the end_io workers up front because the
-        * queue work function gets called at interrupt time, and so it
-        * cannot dynamically grow.
-        */
+
+       btrfs_init_workers(&fs_info->generic_worker,
+                          "genwork", 1, NULL);
+
        btrfs_init_workers(&fs_info->workers, "worker",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
 
        btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
 
        btrfs_init_workers(&fs_info->submit_workers, "submit",
                           min_t(u64, fs_devices->num_devices,
-                          fs_info->thread_pool_size));
+                          fs_info->thread_pool_size),
+                          &fs_info->generic_worker);
 
        /* a higher idle thresh on the submit workers makes it much more
         * likely that bios will be send down in a sane order to the
@@ -1774,15 +1775,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
        fs_info->delalloc_workers.idle_thresh = 2;
        fs_info->delalloc_workers.ordered = 1;
 
-       btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+       btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->endio_workers, "endio",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->endio_meta_write_workers,
-                          "endio-meta-write", fs_info->thread_pool_size);
+                          "endio-meta-write", fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
 
        /*
         * endios are largely parallel and should have a very
@@ -1794,12 +1800,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
        fs_info->endio_write_workers.idle_thresh = 2;
        fs_info->endio_meta_write_workers.idle_thresh = 2;
 
-       fs_info->endio_workers.atomic_worker_start = 1;
-       fs_info->endio_meta_workers.atomic_worker_start = 1;
-       fs_info->endio_write_workers.atomic_worker_start = 1;
-       fs_info->endio_meta_write_workers.atomic_worker_start = 1;
-
        btrfs_start_workers(&fs_info->workers, 1);
+       btrfs_start_workers(&fs_info->generic_worker, 1);
        btrfs_start_workers(&fs_info->submit_workers, 1);
        btrfs_start_workers(&fs_info->delalloc_workers, 1);
        btrfs_start_workers(&fs_info->fixup_workers, 1);
@@ -2012,6 +2014,7 @@ fail_chunk_root:
        free_extent_buffer(chunk_root->node);
        free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
+       btrfs_stop_workers(&fs_info->generic_worker);
        btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->delalloc_workers);
        btrfs_stop_workers(&fs_info->workers);
@@ -2437,6 +2440,7 @@ int close_ctree(struct btrfs_root *root)
 
        iput(fs_info->btree_inode);
 
+       btrfs_stop_workers(&fs_info->generic_worker);
        btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->delalloc_workers);
        btrfs_stop_workers(&fs_info->workers);
index 361ad32..cfcc93c 100644 (file)
@@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        BUG_ON(!rc->block_group);
 
        btrfs_init_workers(&rc->workers, "relocate",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size, NULL);
 
        rc->extent_root = extent_root;
        btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        mapping_tree_init(&rc->reloc_root_tree);
        INIT_LIST_HEAD(&rc->reloc_roots);
        btrfs_init_workers(&rc->workers, "relocate",
-                          root->fs_info->thread_pool_size);
+                          root->fs_info->thread_pool_size, NULL);
        rc->extent_root = root->fs_info->extent_root;
 
        set_reloc_control(rc);