#include <linux/crc32.h>
#include <linux/debugfs.h>
#include <linux/mount.h>
-
-#include <cluster/nodemanager.h>
+#include <linux/seq_file.h>
#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "sysfile.h"
#include "uptodate.h"
#include "ver.h"
-#include "vote.h"
#include "buffer_head_io.h"
struct mount_options
{
+ unsigned long commit_interval;
unsigned long mount_opt;
unsigned int atime_quantum;
signed short slot;
+ unsigned int localalloc_opt;
+ char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
};
static int ocfs2_parse_options(struct super_block *sb, char *options,
static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
static int ocfs2_check_volume(struct ocfs2_super *osb);
static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct buffer_head *bh,
Opt_data_writeback,
Opt_atime_quantum,
Opt_slot,
+ Opt_commit,
+ Opt_localalloc,
+ Opt_localflocks,
+ Opt_stack,
Opt_err,
};
{Opt_data_writeback, "data=writeback"},
{Opt_atime_quantum, "atime_quantum=%u"},
{Opt_slot, "preferred_slot=%u"},
+ {Opt_commit, "commit=%u"},
+ {Opt_localalloc, "localalloc=%d"},
+ {Opt_localflocks, "localflocks"},
+ {Opt_stack, "cluster_stack=%s"},
{Opt_err, NULL}
};
mlog_entry_void();
- new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE);
+ new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
mlog_errno(status);
}
osb->root_inode = new;
- new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE);
+ new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
mlog_errno(status);
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
+ if (parsed_options.commit_interval)
+ osb->osb_commit_interval = parsed_options.commit_interval;
if (!ocfs2_is_hard_readonly(osb))
ocfs2_set_journal_params(osb);
}
}
+ if (ocfs2_userspace_stack(osb)) {
+ if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ mlog(ML_ERROR, "Userspace stack expected, but "
+ "o2cb heartbeat arguments passed to mount\n");
+ return -EINVAL;
+ }
+ }
+
if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
- if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) {
+ if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
+ !ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Heartbeat has to be started to mount "
"a read-write clustered device.\n");
return -EINVAL;
return 0;
}
+/*
+ * If we're using a userspace stack, mount should have passed
+ * a name that matches the disk. If not, mount should not
+ * have passed a stack.
+ */
+static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
+ struct mount_options *mopt)
+{
+ if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
+ mlog(ML_ERROR,
+ "cluster stack passed to mount, but this filesystem "
+ "does not support it\n");
+ return -EINVAL;
+ }
+
+ if (ocfs2_userspace_stack(osb) &&
+ strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
+ OCFS2_STACK_LABEL_LEN)) {
+ mlog(ML_ERROR,
+ "cluster stack passed to mount (\"%s\") does not "
+ "match the filesystem (\"%s\")\n",
+ mopt->cluster_stack,
+ osb->osb_cluster_stack);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
{
struct dentry *root;
goto read_super_error;
}
- /* for now we only have one cluster/node, make sure we see it
- * in the heartbeat universe */
- if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) {
- if (!o2hb_check_local_node_heartbeating()) {
- status = -EINVAL;
- goto read_super_error;
- }
- }
-
/* probe for superblock */
status = ocfs2_sb_probe(sb, &bh, §or_size);
if (status < 0) {
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
+ osb->osb_commit_interval = parsed_options.commit_interval;
+ osb->local_alloc_size = parsed_options.localalloc_opt;
+
+ status = ocfs2_verify_userspace_stack(osb, &parsed_options);
+ if (status)
+ goto read_super_error;
sb->s_magic = OCFS2_SUPER_MAGIC;
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
- snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+ snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
"with %s data mode.\n",
mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
options ? options : "(none)");
+ mopt->commit_interval = 0;
mopt->mount_opt = 0;
mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
mopt->slot = OCFS2_INVALID_SLOT;
+ mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ mopt->cluster_stack[0] = '\0';
if (!options) {
status = 1;
if (option)
mopt->slot = (s16)option;
break;
+ case Opt_commit:
+ option = 0;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option < 0)
+ return 0;
+ if (option == 0)
+ option = JBD_DEFAULT_MAX_COMMIT_AGE;
+ mopt->commit_interval = HZ * option;
+ break;
+ case Opt_localalloc:
+ option = 0;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+ mopt->localalloc_opt = option;
+ break;
+ case Opt_localflocks:
+ /*
+ * Changing this during remount could race
+ * flock() requests, or "unbalance" existing
+ * ones (e.g., a lock is taken in one mode but
+ * dropped in the other). If users care enough
+ * to flip locking modes during remount, we
+ * could add a "local" flag to individual
+ * flock structures for proper tracking of
+ * state.
+ */
+ if (!is_remount)
+ mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
+ break;
+ case Opt_stack:
+ /* Check both that the option we were passed
+ * is of the right length and that it is a proper
+ * string of the right length.
+ */
+ if (((args[0].to - args[0].from) !=
+ OCFS2_STACK_LABEL_LEN) ||
+ (strnlen(args[0].from,
+ OCFS2_STACK_LABEL_LEN) !=
+ OCFS2_STACK_LABEL_LEN)) {
+ mlog(ML_ERROR,
+ "Invalid cluster_stack option\n");
+ status = 0;
+ goto bail;
+ }
+ memcpy(mopt->cluster_stack, args[0].from,
+ OCFS2_STACK_LABEL_LEN);
+ mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+ if (osb->osb_commit_interval)
+ seq_printf(s, ",commit=%u",
+ (unsigned) (osb->osb_commit_interval / HZ));
+
+ if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+ seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+
+ if (opts & OCFS2_MOUNT_LOCALFLOCKS)
+ seq_printf(s, ",localflocks,");
+
+ if (osb->osb_cluster_stack[0])
+ seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
+ osb->osb_cluster_stack);
+
return 0;
}
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
}
+ ocfs2_set_locking_protocol();
+
leave:
if (status < 0) {
ocfs2_free_mem_caches();
goto bail;
}
- status = ocfs2_meta_lock(inode, &bh, 0);
+ status = ocfs2_inode_lock(inode, &bh, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
brelse(bh);
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
status = 0;
bail:
if (inode)
oi->ip_clusters = 0;
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
- ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
- ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
ocfs2_metadata_cache_init(&oi->vfs_inode);
return 0;
}
-/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
-{
- int status;
-
- /* XXX hold a ref on the node while mounte? easy enough, if
- * desirable. */
- if (ocfs2_mount_local(osb))
- osb->node_num = 0;
- else
- osb->node_num = o2nm_this_node();
-
- if (osb->node_num == O2NM_MAX_NODES) {
- mlog(ML_ERROR, "could not find this host's node number\n");
- status = -ENOENT;
- goto bail;
- }
-
- mlog(0, "I am node %d\n", osb->node_num);
-
- status = 0;
-bail:
- return status;
-}
-
static int ocfs2_mount_volume(struct super_block *sb)
{
int status = 0;
if (ocfs2_is_hard_readonly(osb))
goto leave;
- status = ocfs2_fill_local_node_info(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- status = ocfs2_register_hb_callbacks(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- /* requires vote_thread to be running. */
- status = ocfs2_register_net_handlers(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- ocfs2_populate_mounted_map(osb);
-
/* load all node-local system inodes */
status = ocfs2_init_local_system_inodes(osb);
if (status < 0) {
if (ocfs2_mount_local(osb))
goto leave;
- /* This should be sent *after* we recovered our journal as it
- * will cause other nodes to unmark us as needing
- * recovery. However, we need to send it *before* dropping the
- * super block lock as otherwise their recovery threads might
- * try to clean us up while we're live! */
- status = ocfs2_request_mount_vote(osb);
- if (status < 0)
- mlog_errno(status);
-
leave:
if (unlock_super)
ocfs2_super_unlock(osb, 1);
return status;
}
-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
-static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
-{
- mb();
- return osb->recovery_thread_task != NULL;
-}
-
static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
{
- int tmp;
+ int tmp, hangup_needed = 0;
struct ocfs2_super *osb = NULL;
char nodestr[8];
ocfs2_truncate_log_shutdown(osb);
- /* disable any new recovery threads and wait for any currently
- * running ones to exit. Do this before setting the vol_state. */
- mutex_lock(&osb->recovery_lock);
- osb->disable_recovery = 1;
- mutex_unlock(&osb->recovery_lock);
- wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
-
- /* At this point, we know that no more recovery threads can be
- * launched, so wait for any recovery completion work to
- * complete. */
- flush_workqueue(ocfs2_wq);
+ /* This will disable recovery and flush any recovery work. */
+ ocfs2_recovery_exit(osb);
ocfs2_journal_shutdown(osb);
ocfs2_sync_blockdev(sb);
- /* No dlm means we've failed during mount, so skip all the
- * steps which depended on that to complete. */
- if (osb->dlm) {
+ /* No cluster connection means we've failed during mount, so skip
+ * all the steps which depended on that to complete. */
+ if (osb->cconn) {
tmp = ocfs2_super_lock(osb, 1);
if (tmp < 0) {
mlog_errno(tmp);
return;
}
-
- tmp = ocfs2_request_umount_vote(osb);
- if (tmp < 0)
- mlog_errno(tmp);
}
if (osb->slot_num != OCFS2_INVALID_SLOT)
ocfs2_put_slot(osb);
- if (osb->dlm)
+ if (osb->cconn)
ocfs2_super_unlock(osb, 1);
ocfs2_release_system_inodes(osb);
- if (osb->dlm) {
- ocfs2_unregister_net_handlers(osb);
-
- ocfs2_dlm_shutdown(osb);
- }
+ /*
+ * If we're dismounting due to mount error, mount.ocfs2 will clean
+ * up heartbeat. If we're a local mount, there is no heartbeat.
+ * If we failed before we got a uuid_str yet, we can't stop
+ * heartbeat. Otherwise, do it.
+ */
+ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+ hangup_needed = 1;
- ocfs2_clear_hb_callbacks(osb);
+ if (osb->cconn)
+ ocfs2_dlm_shutdown(osb, hangup_needed);
debugfs_remove(osb->osb_debug_root);
- if (!mnt_err)
- ocfs2_stop_heartbeat(osb);
+ if (hangup_needed)
+ ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
- snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+ snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
osb->dev_str, nodestr);
int i, cbits, bbits;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
- struct buffer_head *bitmap_bh = NULL;
struct ocfs2_journal *journal;
__le32 uuid_net_key;
struct ocfs2_super *osb;
osb->s_sectsize_bits = blksize_bits(sector_size);
BUG_ON(!osb->s_sectsize_bits);
- osb->net_response_ids = 0;
- spin_lock_init(&osb->net_response_lock);
- INIT_LIST_HEAD(&osb->net_response_list);
-
- INIT_LIST_HEAD(&osb->osb_net_handlers);
- init_waitqueue_head(&osb->recovery_event);
- spin_lock_init(&osb->vote_task_lock);
- init_waitqueue_head(&osb->vote_event);
- osb->vote_work_sequence = 0;
- osb->vote_wake_sequence = 0;
+ spin_lock_init(&osb->dc_task_lock);
+ init_waitqueue_head(&osb->dc_event);
+ osb->dc_work_sequence = 0;
+ osb->dc_wake_sequence = 0;
INIT_LIST_HEAD(&osb->blocked_lock_list);
osb->blocked_lock_count = 0;
- INIT_LIST_HEAD(&osb->vote_list);
spin_lock_init(&osb->osb_lock);
+ ocfs2_init_inode_steal_slot(osb);
atomic_set(&osb->alloc_stats.moves, 0);
atomic_set(&osb->alloc_stats.local_data, 0);
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
- mutex_init(&osb->recovery_lock);
-
- osb->disable_recovery = 0;
- osb->recovery_thread_task = NULL;
+ status = ocfs2_recovery_init(osb);
+ if (status) {
+ mlog(ML_ERROR, "Unable to initialize recovery state\n");
+ mlog_errno(status);
+ goto bail;
+ }
init_waitqueue_head(&osb->checkpoint_event);
atomic_set(&osb->needs_checkpoint, 0);
osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
- osb->node_num = O2NM_INVALID_NODE_NUM;
osb->slot_num = OCFS2_INVALID_SLOT;
osb->local_alloc_state = OCFS2_LA_UNUSED;
osb->local_alloc_bh = NULL;
- ocfs2_setup_hb_callbacks(osb);
-
init_waitqueue_head(&osb->osb_mount_event);
osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
goto bail;
}
+ if (ocfs2_userspace_stack(osb)) {
+ memcpy(osb->osb_cluster_stack,
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
+ OCFS2_STACK_LABEL_LEN);
+ osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
+ mlog(ML_ERROR,
+ "couldn't mount because of an invalid "
+ "cluster stack label (%s) \n",
+ osb->osb_cluster_stack);
+ status = -EINVAL;
+ goto bail;
+ }
+ } else {
+ /* The empty string is identical with classic tools that
+ * don't know about s_cluster_info. */
+ osb->osb_cluster_stack[0] = '\0';
+ }
+
get_random_bytes(&osb->s_next_generation, sizeof(u32));
/* FIXME
}
memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
- osb->net_key = le32_to_cpu(uuid_net_key);
strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
osb->vol_label[63] = '\0';
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
-
- /* We don't have a cluster lock on the bitmap here because
- * we're only interested in static information and the extra
- * complexity at mount time isn't worht it. Don't pass the
- * inode in to the read function though as we don't want it to
- * be put in the cache. */
- status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
- NULL);
iput(inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- di = (struct ocfs2_dinode *) bitmap_bh->b_data;
- osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
- brelse(bitmap_bh);
- mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
- (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
+ osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
status = ocfs2_init_slot_info(osb);
if (status < 0) {
/* This function assumes that the caller has the main osb resource */
- if (osb->slot_info)
- ocfs2_free_slot_info(osb->slot_info);
+ ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
/* FIXME