Merge branch 'move_extents' of git://oss.oracle.com/git/tye/linux-2.6 into ocfs2...
authorJoel Becker <jlbec@evilplan.org>
Thu, 26 May 2011 04:51:55 +0000 (21:51 -0700)
committerJoel Becker <jlbec@evilplan.org>
Thu, 26 May 2011 04:51:55 +0000 (21:51 -0700)
Conflicts:
fs/ocfs2/ioctl.c

16 files changed:
Documentation/ABI/removed/o2cb [moved from Documentation/ABI/obsolete/o2cb with 65% similarity]
Documentation/feature-removal-schedule.txt
Documentation/filesystems/ocfs2.txt
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/cluster/sys.c
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmdebug.c
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/dlmfs/dlmfs.c
fs/ocfs2/file.c
fs/ocfs2/ioctl.c
fs/ocfs2/ocfs2_trace.h
fs/ocfs2/super.c

similarity index 65%
rename from Documentation/ABI/obsolete/o2cb
rename to Documentation/ABI/removed/o2cb
index 9c49d8e..7f5daa4 100644 (file)
@@ -1,11 +1,10 @@
 What:          /sys/o2cb symlink
-Date:          Dec 2005
-KernelVersion: 2.6.16
+Date:          May 2011
+KernelVersion: 2.6.40
 Contact:       ocfs2-devel@oss.oracle.com
-Description:   This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
-               be removed when new versions of ocfs2-tools which know to look
+Description:   This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
+               removed when new versions of ocfs2-tools which know to look
                in /sys/fs/o2cb are sufficiently prevalent. Don't code new
                software to look here, it should try /sys/fs/o2cb instead.
-               See Documentation/ABI/stable/o2cb for more information on usage.
 Users:         ocfs2-tools. It's sufficient to mail proposed changes to
                ocfs2-devel@oss.oracle.com.
index 95788ad..ff31b1c 100644 (file)
@@ -262,16 +262,6 @@ Who:       Michael Buesch <mb@bu3sch.de>
 
 ---------------------------
 
-What:  /sys/o2cb symlink
-When:  January 2010
-Why:   /sys/fs/o2cb is the proper location for this information - /sys/o2cb
-       exists as a symlink for backwards compatibility for old versions of
-       ocfs2-tools. 2 years should be sufficient time to phase in new versions
-       which know to look in /sys/fs/o2cb.
-Who:   ocfs2-devel@oss.oracle.com
-
----------------------------
-
 What:  Ability for non root users to shm_get hugetlb pages based on mlock
        resource limits
 When:  2.6.31
index 9ed920a..7618a28 100644 (file)
@@ -46,9 +46,15 @@ errors=panic         Panic and halt the machine if an error occurs.
 intr           (*)     Allow signals to interrupt cluster operations.
 nointr                 Do not allow signals to interrupt cluster
                        operations.
+noatime                        Do not update access time.
+relatime(*)            Update atime if the previous atime is older than
+                       mtime or ctime.
+strictatime            Always update atime, but the minimum update interval
+                       is specified by atime_quantum.
 atime_quantum=60(*)    OCFS2 will not update atime unless this number
                        of seconds has passed since the last update.
-                       Set to zero to always update atime.
+                       Set to zero to always update atime. This option only
+                       takes effect in conjunction with strictatime.
 data=ordered   (*)     All data are forced directly out to the main file
                        system prior to its metadata being committed to the
                        journal.
index 48aa9c7..ed553c6 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
 #include <cluster/masklog.h>
 
@@ -7184,3 +7185,168 @@ out_commit:
 out:
        return ret;
 }
+
+static int ocfs2_trim_extent(struct super_block *sb,
+                            struct ocfs2_group_desc *gd,
+                            u32 start, u32 count)
+{
+       u64 discard, bcount;
+
+       bcount = ocfs2_clusters_to_blocks(sb, count);
+       discard = le64_to_cpu(gd->bg_blkno) +
+                       ocfs2_clusters_to_blocks(sb, start);
+
+       trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
+
+       return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+                           struct ocfs2_group_desc *gd,
+                           u32 start, u32 max, u32 minbits)
+{
+       int ret = 0, count = 0, next;
+       void *bitmap = gd->bg_bitmap;
+
+       if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
+               return 0;
+
+       trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+                              start, max, minbits);
+
+       while (start < max) {
+               start = ocfs2_find_next_zero_bit(bitmap, max, start);
+               if (start >= max)
+                       break;
+               next = ocfs2_find_next_bit(bitmap, max, start);
+
+               if ((next - start) >= minbits) {
+                       ret = ocfs2_trim_extent(sb, gd,
+                                               start, next - start);
+                       if (ret < 0) {
+                               mlog_errno(ret);
+                               break;
+                       }
+                       count += next - start;
+               }
+               start = next + 1;
+
+               if (fatal_signal_pending(current)) {
+                       count = -ERESTARTSYS;
+                       break;
+               }
+
+               if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+                       break;
+       }
+
+       if (ret < 0)
+               count = ret;
+
+       return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+       struct ocfs2_super *osb = OCFS2_SB(sb);
+       u64 start, len, trimmed, first_group, last_group, group;
+       int ret, cnt;
+       u32 first_bit, last_bit, minlen;
+       struct buffer_head *main_bm_bh = NULL;
+       struct inode *main_bm_inode = NULL;
+       struct buffer_head *gd_bh = NULL;
+       struct ocfs2_dinode *main_bm;
+       struct ocfs2_group_desc *gd = NULL;
+
+       start = range->start >> osb->s_clustersize_bits;
+       len = range->len >> osb->s_clustersize_bits;
+       minlen = range->minlen >> osb->s_clustersize_bits;
+       trimmed = 0;
+
+       if (!len) {
+               range->len = 0;
+               return 0;
+       }
+
+       if (minlen >= osb->bitmap_cpg)
+               return -EINVAL;
+
+       main_bm_inode = ocfs2_get_system_file_inode(osb,
+                                                   GLOBAL_BITMAP_SYSTEM_INODE,
+                                                   OCFS2_INVALID_SLOT);
+       if (!main_bm_inode) {
+               ret = -EIO;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       mutex_lock(&main_bm_inode->i_mutex);
+
+       ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_mutex;
+       }
+       main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+       if (start >= le32_to_cpu(main_bm->i_clusters)) {
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+
+       if (start + len > le32_to_cpu(main_bm->i_clusters))
+               len = le32_to_cpu(main_bm->i_clusters) - start;
+
+       trace_ocfs2_trim_fs(start, len, minlen);
+
+       /* Determine first and last group to examine based on start and len */
+       first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+       if (first_group == osb->first_cluster_group_blkno)
+               first_bit = start;
+       else
+               first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+       last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+       last_bit = osb->bitmap_cpg;
+
+       for (group = first_group; group <= last_group;) {
+               if (first_bit + len >= osb->bitmap_cpg)
+                       last_bit = osb->bitmap_cpg;
+               else
+                       last_bit = first_bit + len;
+
+               ret = ocfs2_read_group_descriptor(main_bm_inode,
+                                                 main_bm, group,
+                                                 &gd_bh);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       break;
+               }
+
+               gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+               cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+               brelse(gd_bh);
+               gd_bh = NULL;
+               if (cnt < 0) {
+                       ret = cnt;
+                       mlog_errno(ret);
+                       break;
+               }
+
+               trimmed += cnt;
+               len -= osb->bitmap_cpg - first_bit;
+               first_bit = 0;
+               if (group == osb->first_cluster_group_blkno)
+                       group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+               else
+                       group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+       }
+       range->len = trimmed * sb->s_blocksize;
+out_unlock:
+       ocfs2_inode_unlock(main_bm_inode, 0);
+       brelse(main_bm_bh);
+out_mutex:
+       mutex_unlock(&main_bm_inode->i_mutex);
+       iput(main_bm_inode);
+out:
+       return ret;
+}
index 3bd08a0..ca381c5 100644 (file)
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
                    struct buffer_head **leaf_bh);
 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
 
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
 /*
  * Helper function to look at the # of clusters in an extent record.
  */
index bc702da..a4b0773 100644 (file)
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
 void o2cb_sys_shutdown(void)
 {
        mlog_sys_shutdown();
-       sysfs_remove_link(NULL, "o2cb");
        kset_unregister(o2cb_kset);
 }
 
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
        if (!o2cb_kset)
                return -ENOMEM;
 
-       /*
-        * Create this symlink for backwards compatibility with old
-        * versions of ocfs2-tools which look for things in /sys/o2cb.
-        */
-       ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
-       if (ret)
-               goto error;
-
        ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
        if (ret)
                goto error;
index 4bdf7ba..d602abb 100644 (file)
@@ -144,6 +144,7 @@ struct dlm_ctxt
        wait_queue_head_t dlm_join_events;
        unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        struct dlm_recovery_ctxt reco;
        spinlock_t master_lock;
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
        return 1;
 }
 
+static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
+{
+       if (idx == DLM_GRANTED_LIST)
+               return "granted";
+       else if (idx == DLM_CONVERTING_LIST)
+               return "converting";
+       else if (idx == DLM_BLOCKED_LIST)
+               return "blocked";
+       else
+               return "unknown";
+}
+
 static inline struct list_head *
 dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
 {
@@ -448,6 +461,7 @@ enum {
        DLM_FINALIZE_RECO_MSG           = 518,
        DLM_QUERY_REGION                = 519,
        DLM_QUERY_NODEINFO              = 520,
+       DLM_BEGIN_EXIT_DOMAIN_MSG       = 521,
 };
 
 struct dlm_reco_node_data
index 04a32be..56f82cb 100644 (file)
@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
                                 buf + out, len - out);
        out += snprintf(buf + out, len - out, "\n");
 
+       /* Exit Domain Map: xx xx xx */
+       out += snprintf(buf + out, len - out, "Exit Domain Map: ");
+       out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
+                                buf + out, len - out);
+       out += snprintf(buf + out, len - out, "\n");
+
        /* Live Map: xx xx xx */
        out += snprintf(buf + out, len - out, "Live Map: ");
        out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
index 3b179d6..6ed6b95 100644 (file)
@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
  * New in version 1.1:
  *     - Message DLM_QUERY_REGION added to support global heartbeat
  *     - Message DLM_QUERY_NODEINFO added to allow online node removes
+ * New in version 1.2:
+ *     - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
  */
 static const struct dlm_protocol_version dlm_protocol = {
        .pv_major = 1,
-       .pv_minor = 1,
+       .pv_minor = 2,
 };
 
 #define DLM_DOMAIN_BACKOFF_MS 200
@@ -449,14 +451,18 @@ redo_bucket:
                        dropped = dlm_empty_lockres(dlm, res);
 
                        spin_lock(&res->spinlock);
-                       __dlm_lockres_calc_usage(dlm, res);
-                       iter = res->hash_node.next;
+                       if (dropped)
+                               __dlm_lockres_calc_usage(dlm, res);
+                       else
+                               iter = res->hash_node.next;
                        spin_unlock(&res->spinlock);
 
                        dlm_lockres_put(res);
 
-                       if (dropped)
+                       if (dropped) {
+                               cond_resched_lock(&dlm->spinlock);
                                goto redo_bucket;
+                       }
                }
                cond_resched_lock(&dlm->spinlock);
                num += n;
@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
        return ret;
 }
 
+static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
+                                        void *data, void **ret_data)
+{
+       struct dlm_ctxt *dlm = data;
+       unsigned int node;
+       struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+       if (!dlm_grab(dlm))
+               return 0;
+
+       node = exit_msg->node_idx;
+       mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
+
+       spin_lock(&dlm->spinlock);
+       set_bit(node, dlm->exit_domain_map);
+       spin_unlock(&dlm->spinlock);
+
+       dlm_put(dlm);
+
+       return 0;
+}
+
 static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
 {
        /* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
 
        spin_lock(&dlm->spinlock);
        clear_bit(node, dlm->domain_map);
+       clear_bit(node, dlm->exit_domain_map);
        __dlm_print_nodes(dlm);
 
        /* notify anything attached to the heartbeat events */
@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
        return 0;
 }
 
-static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
                                    unsigned int node)
 {
        int status;
        struct dlm_exit_domain leave_msg;
 
-       mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
-                 node, dlm->name, dlm->node_num);
+       mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
+            msg_type, node);
 
        memset(&leave_msg, 0, sizeof(leave_msg));
        leave_msg.node_idx = dlm->node_num;
 
-       status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
-                                   &leave_msg, sizeof(leave_msg), node,
-                                   NULL);
+       status = o2net_send_message(msg_type, dlm->key, &leave_msg,
+                                   sizeof(leave_msg), node, NULL);
        if (status < 0)
-               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
-                    "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
-       mlog(0, "status return %d from o2net_send_message\n", status);
+               mlog(ML_ERROR, "Error %d sending domain exit message %u "
+                    "to node %u on domain %s\n", status, msg_type, node,
+                    dlm->name);
 
        return status;
 }
 
+static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
+{
+       int node = -1;
+
+       /* Support for begin exit domain was added in 1.2 */
+       if (dlm->dlm_locking_proto.pv_major == 1 &&
+           dlm->dlm_locking_proto.pv_minor < 2)
+               return;
+
+       /*
+        * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
+        * informational. Meaning if a node does not receive the message,
+        * so be it.
+        */
+       spin_lock(&dlm->spinlock);
+       while (1) {
+               node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
+               if (node >= O2NM_MAX_NODES)
+                       break;
+               if (node == dlm->node_num)
+                       continue;
+
+               spin_unlock(&dlm->spinlock);
+               dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
+               spin_lock(&dlm->spinlock);
+       }
+       spin_unlock(&dlm->spinlock);
+}
 
 static void dlm_leave_domain(struct dlm_ctxt *dlm)
 {
@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
 
                clear_node = 1;
 
-               status = dlm_send_one_domain_exit(dlm, node);
+               status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
+                                                 node);
                if (status < 0 &&
                    status != -ENOPROTOOPT &&
                    status != -ENOTCONN) {
@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
 
        if (leave) {
                mlog(0, "shutting down domain %s\n", dlm->name);
+               dlm_begin_exit_domain(dlm);
 
                /* We changed dlm state, notify the thread */
                dlm_kick_thread(dlm, NULL);
@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
                 * leftover join state. */
                BUG_ON(dlm->joining_node != assert->node_idx);
                set_bit(assert->node_idx, dlm->domain_map);
+               clear_bit(assert->node_idx, dlm->exit_domain_map);
                __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
 
                printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
        if (status)
                goto bail;
 
+       status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
+                                       sizeof(struct dlm_exit_domain),
+                                       dlm_begin_exit_domain_handler,
+                                       dlm, NULL, &dlm->dlm_domain_handlers);
+       if (status)
+               goto bail;
+
 bail:
        if (status)
                dlm_unregister_domain_handlers(dlm);
index 84d1663..11eefb8 100644 (file)
@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
        dlm_lockres_put(res);
 }
 
-/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
- * if not. If 0, numlocks is set to the number of locks in the lockres.
+/*
+ * A migrateable resource is one that is :
+ * 1. locally mastered, and,
+ * 2. zero local locks, and,
+ * 3. one or more non-local locks, or, one or more references
+ * Returns 1 if yes, 0 if not.
  */
 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
-                                     struct dlm_lock_resource *res,
-                                     int *numlocks,
-                                     int *hasrefs)
+                                     struct dlm_lock_resource *res)
 {
-       int ret;
-       int i;
-       int count = 0;
+       enum dlm_lockres_list idx;
+       int nonlocal = 0, node_ref;
        struct list_head *queue;
        struct dlm_lock *lock;
+       u64 cookie;
 
        assert_spin_locked(&res->spinlock);
 
-       *numlocks = 0;
-       *hasrefs = 0;
-
-       ret = -EINVAL;
-       if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
-               mlog(0, "cannot migrate lockres with unknown owner!\n");
-               goto leave;
-       }
-
-       if (res->owner != dlm->node_num) {
-               mlog(0, "cannot migrate lockres this node doesn't own!\n");
-               goto leave;
-       }
+       if (res->owner != dlm->node_num)
+               return 0;
 
-       ret = 0;
-       queue = &res->granted;
-       for (i = 0; i < 3; i++) {
+        for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+               queue = dlm_list_idx_to_ptr(res, idx);
                list_for_each_entry(lock, queue, list) {
-                       ++count;
-                       if (lock->ml.node == dlm->node_num) {
-                               mlog(0, "found a lock owned by this node still "
-                                    "on the %s queue!  will not migrate this "
-                                    "lockres\n", (i == 0 ? "granted" :
-                                                  (i == 1 ? "converting" :
-                                                   "blocked")));
-                               ret = -ENOTEMPTY;
-                               goto leave;
+                       if (lock->ml.node != dlm->node_num) {
+                               nonlocal++;
+                               continue;
                        }
+                       cookie = be64_to_cpu(lock->ml.cookie);
+                       mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+                            "%s list\n", dlm->name, res->lockname.len,
+                            res->lockname.name,
+                            dlm_get_lock_cookie_node(cookie),
+                            dlm_get_lock_cookie_seq(cookie),
+                            dlm_list_in_text(idx));
+                       return 0;
                }
-               queue++;
        }
 
-       *numlocks = count;
-
-       count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-       if (count < O2NM_MAX_NODES)
-               *hasrefs = 1;
+       if (!nonlocal) {
+               node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+               if (node_ref >= O2NM_MAX_NODES)
+                       return 0;
+       }
 
-       mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
-            res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
+       mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+            res->lockname.name);
 
-leave:
-       return ret;
+       return 1;
 }
 
 /*
@@ -2406,8 +2396,7 @@ leave:
 
 
 static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
-                              struct dlm_lock_resource *res,
-                              u8 target)
+                              struct dlm_lock_resource *res, u8 target)
 {
        struct dlm_master_list_entry *mle = NULL;
        struct dlm_master_list_entry *oldmle = NULL;
@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        const char *name;
        unsigned int namelen;
        int mle_added = 0;
-       int numlocks, hasrefs;
        int wake = 0;
 
        if (!dlm_grab(dlm))
                return -EINVAL;
 
+       BUG_ON(target == O2NM_MAX_NODES);
+
        name = res->lockname.name;
        namelen = res->lockname.len;
 
-       mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
-
-       /*
-        * ensure this lockres is a proper candidate for migration
-        */
-       spin_lock(&res->spinlock);
-       ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
-       if (ret < 0) {
-               spin_unlock(&res->spinlock);
-               goto leave;
-       }
-       spin_unlock(&res->spinlock);
-
-       /* no work to do */
-       if (numlocks == 0 && !hasrefs)
-               goto leave;
-
-       /*
-        * preallocate up front
-        * if this fails, abort
-        */
+       mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
+            target);
 
+       /* preallocate up front. if this fails, abort */
        ret = -ENOMEM;
        mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
        if (!mres) {
@@ -2461,36 +2433,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        }
        ret = 0;
 
-       /*
-        * find a node to migrate the lockres to
-        */
-
-       spin_lock(&dlm->spinlock);
-       /* pick a new node */
-       if (!test_bit(target, dlm->domain_map) ||
-           target >= O2NM_MAX_NODES) {
-               target = dlm_pick_migration_target(dlm, res);
-       }
-       mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
-            namelen, name, target);
-
-       if (target >= O2NM_MAX_NODES ||
-           !test_bit(target, dlm->domain_map)) {
-               /* target chosen is not alive */
-               ret = -EINVAL;
-       }
-
-       if (ret) {
-               spin_unlock(&dlm->spinlock);
-               goto fail;
-       }
-
-       mlog(0, "continuing with target = %u\n", target);
-
        /*
         * clear any existing master requests and
         * add the migration mle to the list
         */
+       spin_lock(&dlm->spinlock);
        spin_lock(&dlm->master_lock);
        ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
                                    namelen, target, dlm->node_num);
@@ -2531,6 +2478,7 @@ fail:
                        dlm_put_mle(mle);
                } else if (mle) {
                        kmem_cache_free(dlm_mle_cache, mle);
+                       mle = NULL;
                }
                goto leave;
        }
@@ -2652,69 +2600,52 @@ leave:
        if (wake)
                wake_up(&res->wq);
 
-       /* TODO: cleanup */
        if (mres)
                free_page((unsigned long)mres);
 
        dlm_put(dlm);
 
-       mlog(0, "returning %d\n", ret);
+       mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
+            name, target, ret);
        return ret;
 }
 
 #define DLM_MIGRATION_RETRY_MS  100
 
-/* Should be called only after beginning the domain leave process.
+/*
+ * Should be called only after beginning the domain leave process.
  * There should not be any remaining locks on nonlocal lock resources,
  * and there should be no local locks left on locally mastered resources.
  *
  * Called with the dlm spinlock held, may drop it to do migration, but
  * will re-acquire before exit.
  *
- * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
+ * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
+ */
 int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
        int ret;
        int lock_dropped = 0;
-       int numlocks, hasrefs;
+       u8 target = O2NM_MAX_NODES;
+
+       assert_spin_locked(&dlm->spinlock);
 
        spin_lock(&res->spinlock);
-       if (res->owner != dlm->node_num) {
-               if (!__dlm_lockres_unused(res)) {
-                       mlog(ML_ERROR, "%s:%.*s: this node is not master, "
-                            "trying to free this but locks remain\n",
-                            dlm->name, res->lockname.len, res->lockname.name);
-               }
-               spin_unlock(&res->spinlock);
-               goto leave;
-       }
+       if (dlm_is_lockres_migrateable(dlm, res))
+               target = dlm_pick_migration_target(dlm, res);
+       spin_unlock(&res->spinlock);
 
-       /* No need to migrate a lockres having no locks */
-       ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
-       if (ret >= 0 && numlocks == 0 && !hasrefs) {
-               spin_unlock(&res->spinlock);
+       if (target == O2NM_MAX_NODES)
                goto leave;
-       }
-       spin_unlock(&res->spinlock);
 
        /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
        spin_unlock(&dlm->spinlock);
        lock_dropped = 1;
-       while (1) {
-               ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
-               if (ret >= 0)
-                       break;
-               if (ret == -ENOTEMPTY) {
-                       mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
-                               res->lockname.len, res->lockname.name);
-                       BUG();
-               }
-
-               mlog(0, "lockres %.*s: migrate failed, "
-                    "retrying\n", res->lockname.len,
-                    res->lockname.name);
-               msleep(DLM_MIGRATION_RETRY_MS);
-       }
+       ret = dlm_migrate_lockres(dlm, res, target);
+       if (ret)
+               mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
+                    dlm->name, res->lockname.len, res->lockname.name,
+                    target, ret);
        spin_lock(&dlm->spinlock);
 leave:
        return lock_dropped;
@@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
        }
 }
 
-/* for now this is not too intelligent.  we will
- * need stats to make this do the right thing.
- * this just finds the first lock on one of the
- * queues and uses that node as the target. */
+/*
+ * Pick a node to migrate the lock resource to. This function selects a
+ * potential target based first on the locks and then on refmap. It skips
+ * nodes that are in the process of exiting the domain.
+ */
 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
                                    struct dlm_lock_resource *res)
 {
-       int i;
+       enum dlm_lockres_list idx;
        struct list_head *queue = &res->granted;
        struct dlm_lock *lock;
-       int nodenum;
+       int noderef;
+       u8 nodenum = O2NM_MAX_NODES;
 
        assert_spin_locked(&dlm->spinlock);
+       assert_spin_locked(&res->spinlock);
 
-       spin_lock(&res->spinlock);
-       for (i=0; i<3; i++) {
+       /* Go through all the locks */
+       for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+               queue = dlm_list_idx_to_ptr(res, idx);
                list_for_each_entry(lock, queue, list) {
-                       /* up to the caller to make sure this node
-                        * is alive */
-                       if (lock->ml.node != dlm->node_num) {
-                               spin_unlock(&res->spinlock);
-                               return lock->ml.node;
-                       }
+                       if (lock->ml.node == dlm->node_num)
+                               continue;
+                       if (test_bit(lock->ml.node, dlm->exit_domain_map))
+                               continue;
+                       nodenum = lock->ml.node;
+                       goto bail;
                }
-               queue++;
-       }
-
-       nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-       if (nodenum < O2NM_MAX_NODES) {
-               spin_unlock(&res->spinlock);
-               return nodenum;
        }
-       spin_unlock(&res->spinlock);
-       mlog(0, "have not found a suitable target yet! checking domain map\n");
 
-       /* ok now we're getting desperate.  pick anyone alive. */
-       nodenum = -1;
+       /* Go thru the refmap */
+       noderef = -1;
        while (1) {
-               nodenum = find_next_bit(dlm->domain_map,
-                                       O2NM_MAX_NODES, nodenum+1);
-               mlog(0, "found %d in domain map\n", nodenum);
-               if (nodenum >= O2NM_MAX_NODES)
+               noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
+                                       noderef + 1);
+               if (noderef >= O2NM_MAX_NODES)
                        break;
-               if (nodenum != dlm->node_num) {
-                       mlog(0, "picking %d\n", nodenum);
-                       return nodenum;
-               }
+               if (noderef == dlm->node_num)
+                       continue;
+               if (test_bit(noderef, dlm->exit_domain_map))
+                       continue;
+               nodenum = noderef;
+               goto bail;
        }
 
-       mlog(0, "giving up.  no master to migrate to\n");
-       return DLM_LOCK_RES_OWNER_UNKNOWN;
+bail:
+       return nodenum;
 }
 
-
-
 /* this is called by the new master once all lockres
  * data has been received */
 static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
index f1beb6f..7efab6d 100644 (file)
@@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 
        mlog(0, "node %u being removed from domain map!\n", idx);
        clear_bit(idx, dlm->domain_map);
+       clear_bit(idx, dlm->exit_domain_map);
        /* wake up migration waiters if a node goes down.
         * perhaps later we can genericize this for other waiters. */
        wake_up(&dlm->migration_wq);
index 8c5c0ed..b420767 100644 (file)
@@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
  *               signifies a bast fired on the lock.
  */
 #define DLMFS_CAPABILITIES "bast stackglue"
-extern int param_set_dlmfs_capabilities(const char *val,
+static int param_set_dlmfs_capabilities(const char *val,
                                        struct kernel_param *kp)
 {
        printk(KERN_ERR "%s: readonly parameter\n", kp->name);
index 89659d6..b1e35a3 100644 (file)
@@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
        .splice_write   = ocfs2_file_splice_write,
+       .fallocate      = ocfs2_fallocate,
 };
 
 const struct file_operations ocfs2_dops_no_plocks = {
index 5910059..bc91072 100644 (file)
@@ -952,6 +952,29 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        return -EFAULT;
 
                return ocfs2_info_handle(inode, &info, 0);
+       case FITRIM:
+       {
+               struct super_block *sb = inode->i_sb;
+               struct fstrim_range range;
+               int ret = 0;
+
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+
+               if (copy_from_user(&range, (struct fstrim_range *)arg,
+                   sizeof(range)))
+                       return -EFAULT;
+
+               ret = ocfs2_trim_fs(sb, &range);
+               if (ret < 0)
+                       return ret;
+
+               if (copy_to_user((struct fstrim_range *)arg, &range,
+                   sizeof(range)))
+                       return -EFAULT;
+
+               return 0;
+       }
        case OCFS2_IOC_MOVE_EXT:
                return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
        default:
@@ -981,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case OCFS2_IOC_GROUP_EXTEND:
        case OCFS2_IOC_GROUP_ADD:
        case OCFS2_IOC_GROUP_ADD64:
+       case FITRIM:
                break;
        case OCFS2_IOC_REFLINK:
                if (copy_from_user(&args, (struct reflink_arguments *)arg,
index a1dae5b..3b481f4 100644 (file)
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
                  __entry->blkno, __entry->bit)
 );
 
+TRACE_EVENT(ocfs2_trim_extent,
+       TP_PROTO(struct super_block *sb, unsigned long long blk,
+                unsigned long long count),
+       TP_ARGS(sb, blk, count),
+       TP_STRUCT__entry(
+               __field(int, dev_major)
+               __field(int, dev_minor)
+               __field(unsigned long long, blk)
+               __field(__u64,  count)
+       ),
+       TP_fast_assign(
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
+               __entry->blk = blk;
+               __entry->count = count;
+       ),
+       TP_printk("%d %d %llu %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->blk, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
 /* End of trace events for fs/ocfs2/alloc.c. */
 
 /* Trace events for fs/ocfs2/localalloc.c. */
index 5a521c7..823bc35 100644 (file)
@@ -1566,7 +1566,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (osb->preferred_slot != OCFS2_INVALID_SLOT)
                seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
 
-       if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
+       if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
                seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
 
        if (osb->osb_commit_interval)