ocfs2_dlm: Add timeout to dlm join domain
author Sunil Mushran <sunil.mushran@oracle.com>
Mon, 29 Jan 2007 23:44:27 +0000 (15:44 -0800)
committer Mark Fasheh <mark.fasheh@oracle.com>
Wed, 7 Feb 2007 20:10:39 +0000 (12:10 -0800)
Currently the ocfs2 dlm has no timeout when joining a dlm domain. While this
is not a problem in normal operation, it becomes an issue if, say, another
node refuses to let this node join the domain because of a stuck recovery.
This patch adds a 90-second timeout.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
fs/ocfs2/dlm/dlmdomain.c

index e8ecf8c..6087c47 100644 (file)
@@ -1264,6 +1264,8 @@ bail:
 static int dlm_join_domain(struct dlm_ctxt *dlm)
 {
        int status;
+       unsigned int backoff;
+       unsigned int total_backoff = 0;
 
        BUG_ON(!dlm);
 
@@ -1295,18 +1297,27 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
        }
 
        do {
-               unsigned int backoff;
                status = dlm_try_to_join_domain(dlm);
 
                /* If we're racing another node to the join, then we
                 * need to back off temporarily and let them
                 * complete. */
+#define        DLM_JOIN_TIMEOUT_MSECS  90000
                if (status == -EAGAIN) {
                        if (signal_pending(current)) {
                                status = -ERESTARTSYS;
                                goto bail;
                        }
 
+                       if (total_backoff >
+                           msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) {
+                               status = -ERESTARTSYS;
+                               mlog(ML_NOTICE, "Timed out joining dlm domain "
+                                    "%s after %u msecs\n", dlm->name,
+                                    jiffies_to_msecs(total_backoff));
+                               goto bail;
+                       }
+
                        /*
                         * <chip> After you!
                         * <dale> No, after you!
@@ -1316,6 +1327,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
                         */
                        backoff = (unsigned int)(jiffies & 0x3);
                        backoff *= DLM_DOMAIN_BACKOFF_MS;
+                       total_backoff += backoff;
                        mlog(0, "backoff %d\n", backoff);
                        msleep(backoff);
                }