Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec...
[pandora-kernel.git] / fs / ocfs2 / cluster / heartbeat.c
index 9f26ac9..a6cc053 100644 (file)
@@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 #define O2HB_DB_TYPE_REGION_LIVENODES  4
 #define O2HB_DB_TYPE_REGION_NUMBER     5
 #define O2HB_DB_TYPE_REGION_ELAPSED_TIME       6
+#define O2HB_DB_TYPE_REGION_PINNED     7
 struct o2hb_debug_buf {
        int db_type;
        int db_size;
@@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
 #define O2HB_DEBUG_FAILEDREGIONS       "failed_regions"
 #define O2HB_DEBUG_REGION_NUMBER       "num"
 #define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
+#define O2HB_DEBUG_REGION_PINNED       "pinned"
 
 static struct dentry *o2hb_debug_dir;
 static struct dentry *o2hb_debug_livenodes;
@@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
 unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
 unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
 
+/*
+ * o2hb_dependent_users tracks the number of registered callbacks that depend
+ * on heartbeat. o2net and o2dlm are two entities that register this callback.
+ * However only o2dlm depends on the heartbeat. It does not want the heartbeat
+ * to stop while a dlm domain is still active.
+ */
+unsigned int o2hb_dependent_users;
+
+/*
+ * In global heartbeat mode, all regions are pinned if there are one or more
+ * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
+ * regions are unpinned if the region count exceeds the cut off or the number
+ * of dependent users falls to zero.
+ */
+#define O2HB_PIN_CUT_OFF               3
+
+/*
+ * In local heartbeat mode, we assume the dlm domain name to be the same as
+ * region uuid. This is true for domains created for the file system but not
+ * necessarily true for userdlm domains. This is a known limitation.
+ *
+ * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
+ * works for both file system and userdlm domains.
+ */
+static int o2hb_region_pin(const char *region_uuid);
+static void o2hb_region_unpin(const char *region_uuid);
+
 /* Only sets a new threshold if there are no active regions.
  *
  * No locking or otherwise interesting code is required for reading
@@ -186,7 +215,9 @@ struct o2hb_region {
        struct config_item      hr_item;
 
        struct list_head        hr_all_item;
-       unsigned                hr_unclean_stop:1;
+       unsigned                hr_unclean_stop:1,
+                               hr_item_pinned:1,
+                               hr_item_dropped:1;
 
        /* protected by the hr_callback_sem */
        struct task_struct      *hr_task;
@@ -212,9 +243,11 @@ struct o2hb_region {
        struct dentry           *hr_debug_livenodes;
        struct dentry           *hr_debug_regnum;
        struct dentry           *hr_debug_elapsed_time;
+       struct dentry           *hr_debug_pinned;
        struct o2hb_debug_buf   *hr_db_livenodes;
        struct o2hb_debug_buf   *hr_db_regnum;
        struct o2hb_debug_buf   *hr_db_elapsed_time;
+       struct o2hb_debug_buf   *hr_db_pinned;
 
        /* let the person setting up hb wait for it to return until it
         * has reached a 'steady' state.  This will be fixed when we have
@@ -307,8 +340,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 
 static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
 {
-       cancel_delayed_work(&reg->hr_write_timeout_work);
-       flush_scheduled_work();
+       cancel_delayed_work_sync(&reg->hr_write_timeout_work);
 }
 
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -702,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
               config_item_name(&reg->hr_item));
 
        set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+
+       /*
+        * If global heartbeat active, unpin all regions if the
+        * region count > CUT_OFF
+        */
+       if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+                          O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
+               o2hb_region_unpin(NULL);
 }
 
 static int o2hb_check_slot(struct o2hb_region *reg,
@@ -1042,6 +1082,9 @@ static int o2hb_thread(void *data)
 
        set_user_nice(current, -20);
 
+       /* Pin node */
+       o2nm_depend_this_node();
+
        while (!kthread_should_stop() && !reg->hr_unclean_stop) {
                /* We track the time spent inside
                 * o2hb_do_disk_heartbeat so that we avoid more than
@@ -1091,6 +1134,9 @@ static int o2hb_thread(void *data)
                mlog_errno(ret);
        }
 
+       /* Unpin node */
+       o2nm_undepend_this_node();
+
        mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
 
        return 0;
@@ -1143,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
                                                 reg->hr_last_timeout_start));
                goto done;
 
+       case O2HB_DB_TYPE_REGION_PINNED:
+               reg = (struct o2hb_region *)db->db_data;
+               out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+                               !!reg->hr_item_pinned);
+               goto done;
+
        default:
                goto done;
        }
@@ -1316,6 +1368,8 @@ int o2hb_init(void)
        memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
        memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
 
+       o2hb_dependent_users = 0;
+
        return o2hb_debug_init();
 }
 
@@ -1385,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item)
        debugfs_remove(reg->hr_debug_livenodes);
        debugfs_remove(reg->hr_debug_regnum);
        debugfs_remove(reg->hr_debug_elapsed_time);
+       debugfs_remove(reg->hr_debug_pinned);
        debugfs_remove(reg->hr_debug_dir);
 
        spin_lock(&o2hb_live_lock);
@@ -1949,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
                goto bail;
        }
 
+       reg->hr_debug_pinned =
+                       o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
+                                         reg->hr_debug_dir,
+                                         &(reg->hr_db_pinned),
+                                         sizeof(*(reg->hr_db_pinned)),
+                                         O2HB_DB_TYPE_REGION_PINNED,
+                                         0, 0, reg);
+       if (!reg->hr_debug_pinned) {
+               mlog_errno(ret);
+               goto bail;
+       }
+
        ret = 0;
 bail:
        return ret;
@@ -2003,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 {
        struct task_struct *hb_task;
        struct o2hb_region *reg = to_o2hb_region(item);
+       int quorum_region = 0;
 
        /* stop the thread when the user removes the region dir */
        spin_lock(&o2hb_live_lock);
        if (o2hb_global_heartbeat_active()) {
                clear_bit(reg->hr_region_num, o2hb_region_bitmap);
                clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+               if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+                       quorum_region = 1;
+               clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
        }
        hb_task = reg->hr_task;
        reg->hr_task = NULL;
+       reg->hr_item_dropped = 1;
        spin_unlock(&o2hb_live_lock);
 
        if (hb_task)
@@ -2029,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
        if (o2hb_global_heartbeat_active())
                printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
                       config_item_name(&reg->hr_item));
+
        config_item_put(item);
+
+       if (!o2hb_global_heartbeat_active() || !quorum_region)
+               return;
+
+       /*
+        * If global heartbeat active and there are dependent users,
+        * pin all regions if quorum region count <= CUT_OFF
+        */
+       spin_lock(&o2hb_live_lock);
+
+       if (!o2hb_dependent_users)
+               goto unlock;
+
+       if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+                          O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+               o2hb_region_pin(NULL);
+
+unlock:
+       spin_unlock(&o2hb_live_lock);
 }
 
 struct o2hb_heartbeat_group_attribute {
@@ -2215,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
 }
 EXPORT_SYMBOL_GPL(o2hb_setup_callback);
 
-static struct o2hb_region *o2hb_find_region(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only pin the matching region. In global we pin all the active
+ * regions.
+ */
+static int o2hb_region_pin(const char *region_uuid)
 {
-       struct o2hb_region *p, *reg = NULL;
+       int ret = 0, found = 0;
+       struct o2hb_region *reg;
+       char *uuid;
 
        assert_spin_locked(&o2hb_live_lock);
 
-       list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
-               if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
-                       reg = p;
-                       break;
+       list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+               uuid = config_item_name(&reg->hr_item);
+
+               /* local heartbeat */
+               if (region_uuid) {
+                       if (strcmp(region_uuid, uuid))
+                               continue;
+                       found = 1;
+               }
+
+               if (reg->hr_item_pinned || reg->hr_item_dropped)
+                       goto skip_pin;
+
+               /* Ignore ENOENT only for local hb (userdlm domain) */
+               ret = o2nm_depend_item(&reg->hr_item);
+               if (!ret) {
+                       mlog(ML_CLUSTER, "Pin region %s\n", uuid);
+                       reg->hr_item_pinned = 1;
+               } else {
+                       if (ret == -ENOENT && found)
+                               ret = 0;
+                       else {
+                               mlog(ML_ERROR, "Pin region %s fails with %d\n",
+                                    uuid, ret);
+                               break;
+                       }
                }
+skip_pin:
+               if (found)
+                       break;
        }
 
-       return reg;
+       return ret;
 }
 
-static int o2hb_region_get(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only unpin the matching region. In global we unpin all the
+ * active regions.
+ */
+static void o2hb_region_unpin(const char *region_uuid)
 {
-       int ret = 0;
        struct o2hb_region *reg;
+       char *uuid;
+       int found = 0;
 
-       spin_lock(&o2hb_live_lock);
+       assert_spin_locked(&o2hb_live_lock);
 
-       reg = o2hb_find_region(region_uuid);
-       if (!reg)
-               ret = -ENOENT;
-       spin_unlock(&o2hb_live_lock);
+       list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+               uuid = config_item_name(&reg->hr_item);
+               if (region_uuid) {
+                       if (strcmp(region_uuid, uuid))
+                               continue;
+                       found = 1;
+               }
 
-       if (ret)
-               goto out;
+               if (reg->hr_item_pinned) {
+                       mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
+                       o2nm_undepend_item(&reg->hr_item);
+                       reg->hr_item_pinned = 0;
+               }
+               if (found)
+                       break;
+       }
+}
 
-       ret = o2nm_depend_this_node();
-       if (ret)
-               goto out;
+static int o2hb_region_inc_user(const char *region_uuid)
+{
+       int ret = 0;
 
-       ret = o2nm_depend_item(&reg->hr_item);
-       if (ret)
-               o2nm_undepend_this_node();
+       spin_lock(&o2hb_live_lock);
 
-out:
+       /* local heartbeat */
+       if (!o2hb_global_heartbeat_active()) {
+           ret = o2hb_region_pin(region_uuid);
+           goto unlock;
+       }
+
+       /*
+        * if global heartbeat active and this is the first dependent user,
+        * pin all regions if quorum region count <= CUT_OFF
+        */
+       o2hb_dependent_users++;
+       if (o2hb_dependent_users > 1)
+               goto unlock;
+
+       if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+                          O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+               ret = o2hb_region_pin(NULL);
+
+unlock:
+       spin_unlock(&o2hb_live_lock);
        return ret;
 }
 
-static void o2hb_region_put(const char *region_uuid)
+void o2hb_region_dec_user(const char *region_uuid)
 {
-       struct o2hb_region *reg;
-
        spin_lock(&o2hb_live_lock);
 
-       reg = o2hb_find_region(region_uuid);
+       /* local heartbeat */
+       if (!o2hb_global_heartbeat_active()) {
+           o2hb_region_unpin(region_uuid);
+           goto unlock;
+       }
 
-       spin_unlock(&o2hb_live_lock);
+       /*
+        * if global heartbeat active and there are no dependent users,
+        * unpin all quorum regions
+        */
+       o2hb_dependent_users--;
+       if (!o2hb_dependent_users)
+               o2hb_region_unpin(NULL);
 
-       if (reg) {
-               o2nm_undepend_item(&reg->hr_item);
-               o2nm_undepend_this_node();
-       }
+unlock:
+       spin_unlock(&o2hb_live_lock);
 }
 
 int o2hb_register_callback(const char *region_uuid,
@@ -2292,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
        }
 
        if (region_uuid) {
-               ret = o2hb_region_get(region_uuid);
-               if (ret)
+               ret = o2hb_region_inc_user(region_uuid);
+               if (ret) {
+                       mlog_errno(ret);
                        goto out;
+               }
        }
 
        down_write(&o2hb_callback_sem);
@@ -2312,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
        up_write(&o2hb_callback_sem);
        ret = 0;
 out:
-       mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+       mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
             ret, __builtin_return_address(0), hc);
        return ret;
 }
@@ -2323,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
 {
        BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
 
-       mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+       mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
             __builtin_return_address(0), hc);
 
        /* XXX Can this happen _with_ a region reference? */
@@ -2331,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
                return;
 
        if (region_uuid)
-               o2hb_region_put(region_uuid);
+               o2hb_region_dec_user(region_uuid);
 
        down_write(&o2hb_callback_sem);