drbd: check MODULE for THIS_MODULE
[pandora-kernel.git] / drivers / block / drbd / drbd_main.c
1 /*
2    drbd.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27  */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
59 struct after_state_chg_work {
60         struct drbd_work w;
61         union drbd_state os;
62         union drbd_state ns;
63         enum chg_state_flags flags;
64         struct completion *done;
65 };
66
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
71
72 int drbd_init(void);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77                            union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82
83 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84               "Lars Ellenberg <lars@linbit.com>");
85 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86 MODULE_VERSION(REL_VERSION);
87 MODULE_LICENSE("GPL");
88 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
89                  __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
90 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
91
92 #include <linux/moduleparam.h>
93 /* allow_open_on_secondary */
94 MODULE_PARM_DESC(allow_oos, "DONT USE!");
95 /* thanks to these macros, if compiled into the kernel (not-module),
96  * this becomes the boot parameter drbd.minor_count */
97 module_param(minor_count, uint, 0444);
98 module_param(disable_sendpage, bool, 0644);
99 module_param(allow_oos, bool, 0);
100 module_param(cn_idx, uint, 0444);
101 module_param(proc_details, int, 0644);
102
#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault injection knobs; only built with CONFIG_DRBD_FAULT_INJECTION. */
int enable_faults;	/* bitmap of enabled faults */
int fault_rate;		/* fault rate % value - applies to all enabled faults */
static int fault_count;	/* count of faults inserted */
int fault_devs;		/* bitmap of devices to insert faults on */

module_param(enable_faults, int, 0664);
module_param(fault_rate, int, 0664);
module_param(fault_count, int, 0664);
module_param(fault_devs, int, 0644);
#endif
117
118 /* module parameter, defined */
119 unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
120 int disable_sendpage;
121 int allow_oos;
122 unsigned int cn_idx = CN_IDX_DRBD;
123 int proc_details;       /* Detail level in proc drbd*/
124
125 /* Module parameter for setting the user mode helper program
126  * to run. Default is /sbin/drbdadm */
127 char usermode_helper[80] = "/sbin/drbdadm";
128
129 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
130
131 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
132  * as member "struct gendisk *vdisk;"
133  */
134 struct drbd_conf **minor_table;
135
136 struct kmem_cache *drbd_request_cache;
137 struct kmem_cache *drbd_ee_cache;       /* epoch entries */
138 struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
139 struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
140 mempool_t *drbd_request_mempool;
141 mempool_t *drbd_ee_mempool;
142
143 /* I do not use a standard mempool, because:
144    1) I want to hand out the pre-allocated objects first.
145    2) I want to be able to interrupt sleeping allocation with a signal.
146    Note: This is a single linked list, the next pointer is the private
147          member of struct page.
148  */
149 struct page *drbd_pp_pool;
150 spinlock_t   drbd_pp_lock;
151 int          drbd_pp_vacant;
152 wait_queue_head_t drbd_pp_wait;
153
154 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
155
156 static const struct block_device_operations drbd_ops = {
157         .owner =   THIS_MODULE,
158         .open =    drbd_open,
159         .release = drbd_release,
160 };
161
/*
 * Number of elements in the array A.  Only valid for real arrays, not for
 * pointers.  The argument is parenthesized so that expression arguments
 * are indexed as a whole.
 */
#define ARRY_SIZE(A) (sizeof(A) / sizeof((A)[0]))
163
#ifdef __CHECKER__
/*
 * Out-of-line variant for sparse runs: as an inline function this produces
 * tons of false positives in sparse, as a real function it checks fine.
 *
 * Takes a reference on mdev->local_cnt and keeps it only if the disk state
 * is at least @mins.  Otherwise the reference is dropped again, waking
 * misc_wait waiters when the count reaches zero.
 *
 * Returns 1 if the reference was kept, 0 otherwise.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	atomic_inc(&mdev->local_cnt);
	if (mdev->state.disk >= mins)
		return 1;

	if (atomic_dec_and_test(&mdev->local_cnt))
		wake_up(&mdev->misc_wait);
	return 0;
}

#endif
182
183 /**
184  * DOC: The transfer log
185  *
186  * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
187  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
188  * of the list. There is always at least one &struct drbd_tl_epoch object.
189  *
190  * Each &struct drbd_tl_epoch has a circular double linked list of requests
191  * attached.
192  */
193 static int tl_init(struct drbd_conf *mdev)
194 {
195         struct drbd_tl_epoch *b;
196
197         /* during device minor initialization, we may well use GFP_KERNEL */
198         b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
199         if (!b)
200                 return 0;
201         INIT_LIST_HEAD(&b->requests);
202         INIT_LIST_HEAD(&b->w.list);
203         b->next = NULL;
204         b->br_number = 4711;
205         b->n_writes = 0;
206         b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
207
208         mdev->oldest_tle = b;
209         mdev->newest_tle = b;
210         INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
211
212         mdev->tl_hash = NULL;
213         mdev->tl_hash_s = 0;
214
215         return 1;
216 }
217
218 static void tl_cleanup(struct drbd_conf *mdev)
219 {
220         D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
221         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
222         kfree(mdev->oldest_tle);
223         mdev->oldest_tle = NULL;
224         kfree(mdev->unused_spare_tle);
225         mdev->unused_spare_tle = NULL;
226         kfree(mdev->tl_hash);
227         mdev->tl_hash = NULL;
228         mdev->tl_hash_s = 0;
229 }
230
231 /**
232  * _tl_add_barrier() - Adds a barrier to the transfer log
233  * @mdev:       DRBD device.
234  * @new:        Barrier to be added before the current head of the TL.
235  *
236  * The caller must hold the req_lock.
237  */
238 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
239 {
240         struct drbd_tl_epoch *newest_before;
241
242         INIT_LIST_HEAD(&new->requests);
243         INIT_LIST_HEAD(&new->w.list);
244         new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
245         new->next = NULL;
246         new->n_writes = 0;
247
248         newest_before = mdev->newest_tle;
249         /* never send a barrier number == 0, because that is special-cased
250          * when using TCQ for our write ordering code */
251         new->br_number = (newest_before->br_number+1) ?: 1;
252         if (mdev->newest_tle != new) {
253                 mdev->newest_tle->next = new;
254                 mdev->newest_tle = new;
255         }
256 }
257
258 /**
259  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
260  * @mdev:       DRBD device.
261  * @barrier_nr: Expected identifier of the DRBD write barrier packet.
262  * @set_size:   Expected number of requests before that barrier.
263  *
264  * In case the passed barrier_nr or set_size does not match the oldest
265  * &struct drbd_tl_epoch objects this function will cause a termination
266  * of the connection.
267  */
268 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
269                        unsigned int set_size)
270 {
271         struct drbd_tl_epoch *b, *nob; /* next old barrier */
272         struct list_head *le, *tle;
273         struct drbd_request *r;
274
275         spin_lock_irq(&mdev->req_lock);
276
277         b = mdev->oldest_tle;
278
279         /* first some paranoia code */
280         if (b == NULL) {
281                 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
282                         barrier_nr);
283                 goto bail;
284         }
285         if (b->br_number != barrier_nr) {
286                 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
287                         barrier_nr, b->br_number);
288                 goto bail;
289         }
290         if (b->n_writes != set_size) {
291                 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
292                         barrier_nr, set_size, b->n_writes);
293                 goto bail;
294         }
295
296         /* Clean up list of requests processed during current epoch */
297         list_for_each_safe(le, tle, &b->requests) {
298                 r = list_entry(le, struct drbd_request, tl_requests);
299                 _req_mod(r, barrier_acked);
300         }
301         /* There could be requests on the list waiting for completion
302            of the write to the local disk. To avoid corruptions of
303            slab's data structures we have to remove the lists head.
304
305            Also there could have been a barrier ack out of sequence, overtaking
306            the write acks - which would be a bug and violating write ordering.
307            To not deadlock in case we lose connection while such requests are
308            still pending, we need some way to find them for the
309            _req_mode(connection_lost_while_pending).
310
311            These have been list_move'd to the out_of_sequence_requests list in
312            _req_mod(, barrier_acked) above.
313            */
314         list_del_init(&b->requests);
315
316         nob = b->next;
317         if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
318                 _tl_add_barrier(mdev, b);
319                 if (nob)
320                         mdev->oldest_tle = nob;
321                 /* if nob == NULL b was the only barrier, and becomes the new
322                    barrier. Therefore mdev->oldest_tle points already to b */
323         } else {
324                 D_ASSERT(nob != NULL);
325                 mdev->oldest_tle = nob;
326                 kfree(b);
327         }
328
329         spin_unlock_irq(&mdev->req_lock);
330         dec_ap_pending(mdev);
331
332         return;
333
334 bail:
335         spin_unlock_irq(&mdev->req_lock);
336         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
337 }
338
339
340 /**
341  * _tl_restart() - Walks the transfer log, and applies an action to all requests
342  * @mdev:       DRBD device.
343  * @what:       The action/event to perform with all request objects
344  *
345  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
346  * restart_frozen_disk_io.
347  */
348 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
349 {
350         struct drbd_tl_epoch *b, *tmp, **pn;
351         struct list_head *le, *tle, carry_reads;
352         struct drbd_request *req;
353         int rv, n_writes, n_reads;
354
355         b = mdev->oldest_tle;
356         pn = &mdev->oldest_tle;
357         while (b) {
358                 n_writes = 0;
359                 n_reads = 0;
360                 INIT_LIST_HEAD(&carry_reads);
361                 list_for_each_safe(le, tle, &b->requests) {
362                         req = list_entry(le, struct drbd_request, tl_requests);
363                         rv = _req_mod(req, what);
364
365                         n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
366                         n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
367                 }
368                 tmp = b->next;
369
370                 if (n_writes) {
371                         if (what == resend) {
372                                 b->n_writes = n_writes;
373                                 if (b->w.cb == NULL) {
374                                         b->w.cb = w_send_barrier;
375                                         inc_ap_pending(mdev);
376                                         set_bit(CREATE_BARRIER, &mdev->flags);
377                                 }
378
379                                 drbd_queue_work(&mdev->data.work, &b->w);
380                         }
381                         pn = &b->next;
382                 } else {
383                         if (n_reads)
384                                 list_add(&carry_reads, &b->requests);
385                         /* there could still be requests on that ring list,
386                          * in case local io is still pending */
387                         list_del(&b->requests);
388
389                         /* dec_ap_pending corresponding to queue_barrier.
390                          * the newest barrier may not have been queued yet,
391                          * in which case w.cb is still NULL. */
392                         if (b->w.cb != NULL)
393                                 dec_ap_pending(mdev);
394
395                         if (b == mdev->newest_tle) {
396                                 /* recycle, but reinit! */
397                                 D_ASSERT(tmp == NULL);
398                                 INIT_LIST_HEAD(&b->requests);
399                                 list_splice(&carry_reads, &b->requests);
400                                 INIT_LIST_HEAD(&b->w.list);
401                                 b->w.cb = NULL;
402                                 b->br_number = net_random();
403                                 b->n_writes = 0;
404
405                                 *pn = b;
406                                 break;
407                         }
408                         *pn = tmp;
409                         kfree(b);
410                 }
411                 b = tmp;
412                 list_splice(&carry_reads, &b->requests);
413         }
414 }
415
416
417 /**
418  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
419  * @mdev:       DRBD device.
420  *
421  * This is called after the connection to the peer was lost. The storage covered
422  * by the requests on the transfer gets marked as our of sync. Called from the
423  * receiver thread and the worker thread.
424  */
425 void tl_clear(struct drbd_conf *mdev)
426 {
427         struct list_head *le, *tle;
428         struct drbd_request *r;
429
430         spin_lock_irq(&mdev->req_lock);
431
432         _tl_restart(mdev, connection_lost_while_pending);
433
434         /* we expect this list to be empty. */
435         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
436
437         /* but just in case, clean it up anyways! */
438         list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
439                 r = list_entry(le, struct drbd_request, tl_requests);
440                 /* It would be nice to complete outside of spinlock.
441                  * But this is easier for now. */
442                 _req_mod(r, connection_lost_while_pending);
443         }
444
445         /* ensure bit indicating barrier is required is clear */
446         clear_bit(CREATE_BARRIER, &mdev->flags);
447
448         memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
449
450         spin_unlock_irq(&mdev->req_lock);
451 }
452
453 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
454 {
455         spin_lock_irq(&mdev->req_lock);
456         _tl_restart(mdev, what);
457         spin_unlock_irq(&mdev->req_lock);
458 }
459
460 /**
461  * cl_wide_st_chg() - true if the state change is a cluster wide one
462  * @mdev:       DRBD device.
463  * @os:         old (current) state.
464  * @ns:         new (wanted) state.
465  */
466 static int cl_wide_st_chg(struct drbd_conf *mdev,
467                           union drbd_state os, union drbd_state ns)
468 {
469         return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
470                  ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
471                   (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
472                   (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
473                   (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
474                 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
475                 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
476 }
477
478 enum drbd_state_rv
479 drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
480                   union drbd_state mask, union drbd_state val)
481 {
482         unsigned long flags;
483         union drbd_state os, ns;
484         enum drbd_state_rv rv;
485
486         spin_lock_irqsave(&mdev->req_lock, flags);
487         os = mdev->state;
488         ns.i = (os.i & ~mask.i) | val.i;
489         rv = _drbd_set_state(mdev, ns, f, NULL);
490         ns = mdev->state;
491         spin_unlock_irqrestore(&mdev->req_lock, flags);
492
493         return rv;
494 }
495
496 /**
497  * drbd_force_state() - Impose a change which happens outside our control on our state
498  * @mdev:       DRBD device.
499  * @mask:       mask of state bits to change.
500  * @val:        value of new state bits.
501  */
502 void drbd_force_state(struct drbd_conf *mdev,
503         union drbd_state mask, union drbd_state val)
504 {
505         drbd_change_state(mdev, CS_HARD, mask, val);
506 }
507
/* State validation helpers, defined further down in this file. */
static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev,
				       union drbd_state os,
				       union drbd_state ns,
				       const char **warn_sync_abort);

/* Sends a state change request (mask, val); used by drbd_req_state(). */
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);
516
517 static enum drbd_state_rv
518 _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
519              union drbd_state val)
520 {
521         union drbd_state os, ns;
522         unsigned long flags;
523         enum drbd_state_rv rv;
524
525         if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
526                 return SS_CW_SUCCESS;
527
528         if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
529                 return SS_CW_FAILED_BY_PEER;
530
531         rv = 0;
532         spin_lock_irqsave(&mdev->req_lock, flags);
533         os = mdev->state;
534         ns.i = (os.i & ~mask.i) | val.i;
535         ns = sanitize_state(mdev, os, ns, NULL);
536
537         if (!cl_wide_st_chg(mdev, os, ns))
538                 rv = SS_CW_NO_NEED;
539         if (!rv) {
540                 rv = is_valid_state(mdev, ns);
541                 if (rv == SS_SUCCESS) {
542                         rv = is_valid_state_transition(mdev, ns, os);
543                         if (rv == SS_SUCCESS)
544                                 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
545                 }
546         }
547         spin_unlock_irqrestore(&mdev->req_lock, flags);
548
549         return rv;
550 }
551
552 /**
553  * drbd_req_state() - Perform an eventually cluster wide state change
554  * @mdev:       DRBD device.
555  * @mask:       mask of state bits to change.
556  * @val:        value of new state bits.
557  * @f:          flags
558  *
559  * Should not be called directly, use drbd_request_state() or
560  * _drbd_request_state().
561  */
562 static enum drbd_state_rv
563 drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
564                union drbd_state val, enum chg_state_flags f)
565 {
566         struct completion done;
567         unsigned long flags;
568         union drbd_state os, ns;
569         enum drbd_state_rv rv;
570
571         init_completion(&done);
572
573         if (f & CS_SERIALIZE)
574                 mutex_lock(&mdev->state_mutex);
575
576         spin_lock_irqsave(&mdev->req_lock, flags);
577         os = mdev->state;
578         ns.i = (os.i & ~mask.i) | val.i;
579         ns = sanitize_state(mdev, os, ns, NULL);
580
581         if (cl_wide_st_chg(mdev, os, ns)) {
582                 rv = is_valid_state(mdev, ns);
583                 if (rv == SS_SUCCESS)
584                         rv = is_valid_state_transition(mdev, ns, os);
585                 spin_unlock_irqrestore(&mdev->req_lock, flags);
586
587                 if (rv < SS_SUCCESS) {
588                         if (f & CS_VERBOSE)
589                                 print_st_err(mdev, os, ns, rv);
590                         goto abort;
591                 }
592
593                 drbd_state_lock(mdev);
594                 if (!drbd_send_state_req(mdev, mask, val)) {
595                         drbd_state_unlock(mdev);
596                         rv = SS_CW_FAILED_BY_PEER;
597                         if (f & CS_VERBOSE)
598                                 print_st_err(mdev, os, ns, rv);
599                         goto abort;
600                 }
601
602                 wait_event(mdev->state_wait,
603                         (rv = _req_st_cond(mdev, mask, val)));
604
605                 if (rv < SS_SUCCESS) {
606                         drbd_state_unlock(mdev);
607                         if (f & CS_VERBOSE)
608                                 print_st_err(mdev, os, ns, rv);
609                         goto abort;
610                 }
611                 spin_lock_irqsave(&mdev->req_lock, flags);
612                 os = mdev->state;
613                 ns.i = (os.i & ~mask.i) | val.i;
614                 rv = _drbd_set_state(mdev, ns, f, &done);
615                 drbd_state_unlock(mdev);
616         } else {
617                 rv = _drbd_set_state(mdev, ns, f, &done);
618         }
619
620         spin_unlock_irqrestore(&mdev->req_lock, flags);
621
622         if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
623                 D_ASSERT(current != mdev->worker.task);
624                 wait_for_completion(&done);
625         }
626
627 abort:
628         if (f & CS_SERIALIZE)
629                 mutex_unlock(&mdev->state_mutex);
630
631         return rv;
632 }
633
634 /**
635  * _drbd_request_state() - Request a state change (with flags)
636  * @mdev:       DRBD device.
637  * @mask:       mask of state bits to change.
638  * @val:        value of new state bits.
639  * @f:          flags
640  *
641  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
642  * flag, or when logging of failed state change requests is not desired.
643  */
644 enum drbd_state_rv
645 _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
646                     union drbd_state val, enum chg_state_flags f)
647 {
648         enum drbd_state_rv rv;
649
650         wait_event(mdev->state_wait,
651                    (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
652
653         return rv;
654 }
655
656 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
657 {
658         dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
659             name,
660             drbd_conn_str(ns.conn),
661             drbd_role_str(ns.role),
662             drbd_role_str(ns.peer),
663             drbd_disk_str(ns.disk),
664             drbd_disk_str(ns.pdsk),
665             is_susp(ns) ? 's' : 'r',
666             ns.aftr_isp ? 'a' : '-',
667             ns.peer_isp ? 'p' : '-',
668             ns.user_isp ? 'u' : '-'
669             );
670 }
671
672 void print_st_err(struct drbd_conf *mdev, union drbd_state os,
673                   union drbd_state ns, enum drbd_state_rv err)
674 {
675         if (err == SS_IN_TRANSIENT_STATE)
676                 return;
677         dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
678         print_st(mdev, " state", os);
679         print_st(mdev, "wanted", ns);
680 }
681
682
683 /**
684  * is_valid_state() - Returns an SS_ error code if ns is not valid
685  * @mdev:       DRBD device.
686  * @ns:         State to consider.
687  */
688 static enum drbd_state_rv
689 is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
690 {
691         /* See drbd_state_sw_errors in drbd_strings.c */
692
693         enum drbd_fencing_p fp;
694         enum drbd_state_rv rv = SS_SUCCESS;
695
696         fp = FP_DONT_CARE;
697         if (get_ldev(mdev)) {
698                 fp = mdev->ldev->dc.fencing;
699                 put_ldev(mdev);
700         }
701
702         if (get_net_conf(mdev)) {
703                 if (!mdev->net_conf->two_primaries &&
704                     ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
705                         rv = SS_TWO_PRIMARIES;
706                 put_net_conf(mdev);
707         }
708
709         if (rv <= 0)
710                 /* already found a reason to abort */;
711         else if (ns.role == R_SECONDARY && mdev->open_cnt)
712                 rv = SS_DEVICE_IN_USE;
713
714         else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
715                 rv = SS_NO_UP_TO_DATE_DISK;
716
717         else if (fp >= FP_RESOURCE &&
718                  ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
719                 rv = SS_PRIMARY_NOP;
720
721         else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
722                 rv = SS_NO_UP_TO_DATE_DISK;
723
724         else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
725                 rv = SS_NO_LOCAL_DISK;
726
727         else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
728                 rv = SS_NO_REMOTE_DISK;
729
730         else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
731                 rv = SS_NO_UP_TO_DATE_DISK;
732
733         else if ((ns.conn == C_CONNECTED ||
734                   ns.conn == C_WF_BITMAP_S ||
735                   ns.conn == C_SYNC_SOURCE ||
736                   ns.conn == C_PAUSED_SYNC_S) &&
737                   ns.disk == D_OUTDATED)
738                 rv = SS_CONNECTED_OUTDATES;
739
740         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
741                  (mdev->sync_conf.verify_alg[0] == 0))
742                 rv = SS_NO_VERIFY_ALG;
743
744         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
745                   mdev->agreed_pro_version < 88)
746                 rv = SS_NOT_SUPPORTED;
747
748         else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
749                 rv = SS_CONNECTED_OUTDATES;
750
751         return rv;
752 }
753
754 /**
755  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
756  * @mdev:       DRBD device.
757  * @ns:         new state.
758  * @os:         old state.
759  */
760 static enum drbd_state_rv
761 is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
762                           union drbd_state os)
763 {
764         enum drbd_state_rv rv = SS_SUCCESS;
765
766         if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
767             os.conn > C_CONNECTED)
768                 rv = SS_RESYNC_RUNNING;
769
770         if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
771                 rv = SS_ALREADY_STANDALONE;
772
773         if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
774                 rv = SS_IS_DISKLESS;
775
776         if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
777                 rv = SS_NO_NET_CONFIG;
778
779         if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
780                 rv = SS_LOWER_THAN_OUTDATED;
781
782         if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
783                 rv = SS_IN_TRANSIENT_STATE;
784
785         if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
786                 rv = SS_IN_TRANSIENT_STATE;
787
788         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
789                 rv = SS_NEED_CONNECTION;
790
791         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
792             ns.conn != os.conn && os.conn > C_CONNECTED)
793                 rv = SS_RESYNC_RUNNING;
794
795         if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
796             os.conn < C_CONNECTED)
797                 rv = SS_NEED_CONNECTION;
798
799         if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
800             && os.conn < C_WF_REPORT_PARAMS)
801                 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
802
803         return rv;
804 }
805
806 /**
807  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
808  * @mdev:       DRBD device.
809  * @os:         old state.
810  * @ns:         new state.
811  * @warn_sync_abort:
812  *
813  * When we lose connection, we have to set the state of the peer's disk (pdsk)
814  * to D_UNKNOWN. This rule and many more along those lines are in this function.
815  */
816 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
817                                        union drbd_state ns, const char **warn_sync_abort)
818 {
819         enum drbd_fencing_p fp;
820         enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
821
822         fp = FP_DONT_CARE;
823         if (get_ldev(mdev)) {
824                 fp = mdev->ldev->dc.fencing;
825                 put_ldev(mdev);
826         }
827
828         /* Disallow Network errors to configure a device's network part */
829         if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
830             os.conn <= C_DISCONNECTING)
831                 ns.conn = os.conn;
832
833         /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
834          * If you try to go into some Sync* state, that shall fail (elsewhere). */
835         if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
836             ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
837                 ns.conn = os.conn;
838
839         /* we cannot fail (again) if we already detached */
840         if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
841                 ns.disk = D_DISKLESS;
842
843         /* if we are only D_ATTACHING yet,
844          * we can (and should) go directly to D_DISKLESS. */
845         if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
846                 ns.disk = D_DISKLESS;
847
848         /* After C_DISCONNECTING only C_STANDALONE may follow */
849         if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
850                 ns.conn = os.conn;
851
852         if (ns.conn < C_CONNECTED) {
853                 ns.peer_isp = 0;
854                 ns.peer = R_UNKNOWN;
855                 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
856                         ns.pdsk = D_UNKNOWN;
857         }
858
859         /* Clear the aftr_isp when becoming unconfigured */
860         if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
861                 ns.aftr_isp = 0;
862
863         /* Abort resync if a disk fails/detaches */
864         if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
865             (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
866                 if (warn_sync_abort)
867                         *warn_sync_abort =
868                                 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
869                                 "Online-verify" : "Resync";
870                 ns.conn = C_CONNECTED;
871         }
872
873         /* Connection breaks down before we finished "Negotiating" */
874         if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
875             get_ldev_if_state(mdev, D_NEGOTIATING)) {
876                 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
877                         ns.disk = mdev->new_state_tmp.disk;
878                         ns.pdsk = mdev->new_state_tmp.pdsk;
879                 } else {
880                         dev_alert(DEV, "Connection lost while negotiating, no data!\n");
881                         ns.disk = D_DISKLESS;
882                         ns.pdsk = D_UNKNOWN;
883                 }
884                 put_ldev(mdev);
885         }
886
887         /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
888         if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
889                 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
890                         ns.disk = D_UP_TO_DATE;
891                 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
892                         ns.pdsk = D_UP_TO_DATE;
893         }
894
895         /* Implications of the connection stat on the disk states */
896         disk_min = D_DISKLESS;
897         disk_max = D_UP_TO_DATE;
898         pdsk_min = D_INCONSISTENT;
899         pdsk_max = D_UNKNOWN;
900         switch ((enum drbd_conns)ns.conn) {
901         case C_WF_BITMAP_T:
902         case C_PAUSED_SYNC_T:
903         case C_STARTING_SYNC_T:
904         case C_WF_SYNC_UUID:
905         case C_BEHIND:
906                 disk_min = D_INCONSISTENT;
907                 disk_max = D_OUTDATED;
908                 pdsk_min = D_UP_TO_DATE;
909                 pdsk_max = D_UP_TO_DATE;
910                 break;
911         case C_VERIFY_S:
912         case C_VERIFY_T:
913                 disk_min = D_UP_TO_DATE;
914                 disk_max = D_UP_TO_DATE;
915                 pdsk_min = D_UP_TO_DATE;
916                 pdsk_max = D_UP_TO_DATE;
917                 break;
918         case C_CONNECTED:
919                 disk_min = D_DISKLESS;
920                 disk_max = D_UP_TO_DATE;
921                 pdsk_min = D_DISKLESS;
922                 pdsk_max = D_UP_TO_DATE;
923                 break;
924         case C_WF_BITMAP_S:
925         case C_PAUSED_SYNC_S:
926         case C_STARTING_SYNC_S:
927         case C_AHEAD:
928                 disk_min = D_UP_TO_DATE;
929                 disk_max = D_UP_TO_DATE;
930                 pdsk_min = D_INCONSISTENT;
931                 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
932                 break;
933         case C_SYNC_TARGET:
934                 disk_min = D_INCONSISTENT;
935                 disk_max = D_INCONSISTENT;
936                 pdsk_min = D_UP_TO_DATE;
937                 pdsk_max = D_UP_TO_DATE;
938                 break;
939         case C_SYNC_SOURCE:
940                 disk_min = D_UP_TO_DATE;
941                 disk_max = D_UP_TO_DATE;
942                 pdsk_min = D_INCONSISTENT;
943                 pdsk_max = D_INCONSISTENT;
944                 break;
945         case C_STANDALONE:
946         case C_DISCONNECTING:
947         case C_UNCONNECTED:
948         case C_TIMEOUT:
949         case C_BROKEN_PIPE:
950         case C_NETWORK_FAILURE:
951         case C_PROTOCOL_ERROR:
952         case C_TEAR_DOWN:
953         case C_WF_CONNECTION:
954         case C_WF_REPORT_PARAMS:
955         case C_MASK:
956                 break;
957         }
958         if (ns.disk > disk_max)
959                 ns.disk = disk_max;
960
961         if (ns.disk < disk_min) {
962                 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
963                          drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
964                 ns.disk = disk_min;
965         }
966         if (ns.pdsk > pdsk_max)
967                 ns.pdsk = pdsk_max;
968
969         if (ns.pdsk < pdsk_min) {
970                 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
971                          drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
972                 ns.pdsk = pdsk_min;
973         }
974
975         if (fp == FP_STONITH &&
976             (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
977             !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
978                 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
979
980         if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
981             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
982             !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
983                 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
984
985         if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
986                 if (ns.conn == C_SYNC_SOURCE)
987                         ns.conn = C_PAUSED_SYNC_S;
988                 if (ns.conn == C_SYNC_TARGET)
989                         ns.conn = C_PAUSED_SYNC_T;
990         } else {
991                 if (ns.conn == C_PAUSED_SYNC_S)
992                         ns.conn = C_SYNC_SOURCE;
993                 if (ns.conn == C_PAUSED_SYNC_T)
994                         ns.conn = C_SYNC_TARGET;
995         }
996
997         return ns;
998 }
999
1000 /* Set up the online-verify start position and counters; used by __drbd_set_state */
1001 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1002 {
1003         unsigned long first_bit;
1004
1005         if (mdev->agreed_pro_version < 90)
1006                 mdev->ov_start_sector = 0;
1007         mdev->rs_total = drbd_bm_bits(mdev);
1008         if (cs == C_VERIFY_T) {
1009                 /* start is arbitrary: ov_left & co. are set up in receive_DataRequest on the first P_OV_REQUEST */
1010                 mdev->ov_start_sector = ~(sector_t)0;
1011                 mdev->ov_position = 0;
1012                 mdev->ov_left = mdev->rs_total;
1013                 return;
1014         }
1015
1016         first_bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1017         if (first_bit < mdev->rs_total) {
1018                 mdev->rs_total -= first_bit;
1019         } else {
1020                 mdev->ov_start_sector = BM_BIT_TO_SECT(mdev->rs_total - 1);
1021                 mdev->rs_total = 1;
1022         }
1023         mdev->ov_position = mdev->ov_start_sector;
1024         mdev->ov_left = mdev->rs_total;
1025 }
1026
1027 static void drbd_resume_al(struct drbd_conf *mdev)
1028 {
1029         if (!test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1030                 return; /* flag was already clear: nothing to report */
1031         dev_info(DEV, "Resumed AL updates\n");
1032 }
1032
1033 /**
1034  * __drbd_set_state() - Set a new DRBD state
1035  * @mdev:       DRBD device.
1036  * @ns:         new state; it is passed through sanitize_state() before being checked and applied.
1037  * @flags:      Flags; CS_HARD skips the validity checks, CS_VERBOSE logs a refused change.
1038  * @done:       Optional completion, that will get completed after the after_state_ch() finished
1039  *
1040  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1041  */
1042 enum drbd_state_rv
1043 __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1044                  enum chg_state_flags flags, struct completion *done)
1045 {
1046         union drbd_state os;
1047         enum drbd_state_rv rv = SS_SUCCESS;
1048         const char *warn_sync_abort = NULL;
1049         struct after_state_chg_work *ascw;
1050         /* snapshot of the current state; it is the "old state" for everything below */
1051         os = mdev->state;
1052
1053         ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1054
1055         if (ns.i == os.i)
1056                 return SS_NOTHING_TO_DO;
1057         /* validity checks; a result below SS_SUCCESS refuses the change before mdev->state is touched */
1058         if (!(flags & CS_HARD)) {
1059                 /*  pre-state-change checks ; only look at ns  */
1060                 /* See drbd_state_sw_errors in drbd_strings.c */
1061
1062                 rv = is_valid_state(mdev, ns);
1063                 if (rv < SS_SUCCESS) {
1064                         /* If the old state was illegal as well, then let
1065                            this happen...*/
1066                         /* i.e. os fails with the very same code: judge by the transition check alone */
1067                         if (is_valid_state(mdev, os) == rv)
1068                                 rv = is_valid_state_transition(mdev, ns, os);
1069                 } else
1070                         rv = is_valid_state_transition(mdev, ns, os);
1071         }
1072
1073         if (rv < SS_SUCCESS) {
1074                 if (flags & CS_VERBOSE)
1075                         print_st_err(mdev, os, ns, rv);
1076                 return rv;
1077         }
1078
1079         if (warn_sync_abort)
1080                 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1081         /* log every field that differs between os and ns, all in one line */
1082         {
1083         char *pbp, pb[300];
1084         pbp = pb;
1085         *pbp = 0;
1086         if (ns.role != os.role)
1087                 pbp += sprintf(pbp, "role( %s -> %s ) ",
1088                                drbd_role_str(os.role),
1089                                drbd_role_str(ns.role));
1090         if (ns.peer != os.peer)
1091                 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1092                                drbd_role_str(os.peer),
1093                                drbd_role_str(ns.peer));
1094         if (ns.conn != os.conn)
1095                 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1096                                drbd_conn_str(os.conn),
1097                                drbd_conn_str(ns.conn));
1098         if (ns.disk != os.disk)
1099                 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1100                                drbd_disk_str(os.disk),
1101                                drbd_disk_str(ns.disk));
1102         if (ns.pdsk != os.pdsk)
1103                 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1104                                drbd_disk_str(os.pdsk),
1105                                drbd_disk_str(ns.pdsk));
1106         if (is_susp(ns) != is_susp(os))
1107                 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1108                                is_susp(os),
1109                                is_susp(ns));
1110         if (ns.aftr_isp != os.aftr_isp)
1111                 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1112                                os.aftr_isp,
1113                                ns.aftr_isp);
1114         if (ns.peer_isp != os.peer_isp)
1115                 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1116                                os.peer_isp,
1117                                ns.peer_isp);
1118         if (ns.user_isp != os.user_isp)
1119                 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1120                                os.user_isp,
1121                                ns.user_isp);
1122         dev_info(DEV, "%s\n", pb);
1123         }
1124
1125         /* solve the race between becoming unconfigured,
1126          * worker doing the cleanup, and
1127          * admin reconfiguring us:
1128          * on (re)configure, first set CONFIG_PENDING,
1129          * then wait for a potentially exiting worker,
1130          * start the worker, and schedule one no_op.
1131          * then proceed with configuration.
1132          */
1133         if (ns.disk == D_DISKLESS &&
1134             ns.conn == C_STANDALONE &&
1135             ns.role == R_SECONDARY &&
1136             !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1137                 set_bit(DEVICE_DYING, &mdev->flags);
1138
1139         /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1140          * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1141          * drbd_ldev_destroy() won't happen before our corresponding
1142          * after_state_ch works run, where we put_ldev again. */
1143         if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1144             (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1145                 atomic_inc(&mdev->local_cnt);
1146         /* install the new state; everything below reacts to the os -> ns difference */
1147         mdev->state = ns;
1148
1149         if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1150                 drbd_print_uuids(mdev, "attached to UUIDs");
1151
1152         wake_up(&mdev->misc_wait);
1153         wake_up(&mdev->state_wait);
1154
1155         /* aborted verify run. log the last position */
1156         if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1157             ns.conn < C_CONNECTED) {
1158                 mdev->ov_start_sector =
1159                         BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1160                 dev_info(DEV, "Online Verify reached sector %llu\n",
1161                         (unsigned long long)mdev->ov_start_sector);
1162         }
1163         /* paused -> running: add the time since rs_mark_time[rs_last_mark] (stamped on suspend, below) to rs_paused */
1164         if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1165             (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1166                 dev_info(DEV, "Syncer continues.\n");
1167                 mdev->rs_paused += (long)jiffies
1168                                   -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1169                 if (ns.conn == C_SYNC_TARGET)
1170                         mod_timer(&mdev->resync_timer, jiffies);
1171         }
1172
1173         if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1174             (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1175                 dev_info(DEV, "Resync suspended\n");
1176                 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1177         }
1178         /* online verify starts: set the position, then reset all progress marks to "now" / ov_left */
1179         if (os.conn == C_CONNECTED &&
1180             (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1181                 unsigned long now = jiffies;
1182                 int i;
1183
1184                 set_ov_position(mdev, ns.conn);
1185                 mdev->rs_start = now;
1186                 mdev->rs_last_events = 0;
1187                 mdev->rs_last_sect_ev = 0;
1188                 mdev->ov_last_oos_size = 0;
1189                 mdev->ov_last_oos_start = 0;
1190
1191                 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1192                         mdev->rs_mark_left[i] = mdev->ov_left;
1193                         mdev->rs_mark_time[i] = now;
1194                 }
1195
1196                 drbd_rs_controller_reset(mdev);
1197
1198                 if (ns.conn == C_VERIFY_S) {
1199                         dev_info(DEV, "Starting Online Verify from sector %llu\n",
1200                                         (unsigned long long)mdev->ov_position);
1201                         mod_timer(&mdev->resync_timer, jiffies);
1202                 }
1203         }
1204         /* recompute the MDF_* meta data flags from the new state; drbd_md_mark_dirty() only if they changed */
1205         if (get_ldev(mdev)) {
1206                 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1207                                                  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1208                                                  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1209
1210                 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1211                         mdf |= MDF_CRASHED_PRIMARY;
1212                 if (mdev->state.role == R_PRIMARY ||
1213                     (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1214                         mdf |= MDF_PRIMARY_IND;
1215                 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1216                         mdf |= MDF_CONNECTED_IND;
1217                 if (mdev->state.disk > D_INCONSISTENT)
1218                         mdf |= MDF_CONSISTENT;
1219                 if (mdev->state.disk > D_OUTDATED)
1220                         mdf |= MDF_WAS_UP_TO_DATE;
1221                 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1222                         mdf |= MDF_PEER_OUT_DATED;
1223                 if (mdf != mdev->ldev->md.flags) {
1224                         mdev->ldev->md.flags = mdf;
1225                         drbd_md_mark_dirty(mdev);
1226                 }
1227                 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1228                         drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1229                 put_ldev(mdev);
1230         }
1231
1232         /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1233         if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1234             os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1235                 set_bit(CONSIDER_RESYNC, &mdev->flags);
1236
1237         /* Receiver should clean up itself */
1238         if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1239                 drbd_thread_stop_nowait(&mdev->receiver);
1240
1241         /* Now the receiver finished cleaning up itself, it should die */
1242         if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1243                 drbd_thread_stop_nowait(&mdev->receiver);
1244
1245         /* Upon network failure, we need to restart the receiver. */
1246         if (os.conn > C_TEAR_DOWN &&
1247             ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1248                 drbd_thread_restart_nowait(&mdev->receiver);
1249
1250         /* Resume AL writing if we get a connection */
1251         if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1252                 drbd_resume_al(mdev);
1253         /* hand os/ns/flags/done to w_after_state_ch() via mdev->data.work; it runs after_state_ch() */
1254         ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1255         if (ascw) {
1256                 ascw->os = os;
1257                 ascw->ns = ns;
1258                 ascw->flags = flags;
1259                 ascw->w.cb = w_after_state_ch;
1260                 ascw->done = done;
1261                 drbd_queue_work(&mdev->data.work, &ascw->w);
1262         } else {
1263                 dev_warn(DEV, "Could not kmalloc an ascw\n"); /* NOTE(review): no work queued, so @done is not completed by w_after_state_ch() - confirm callers cope */
1264         }
1265
1266         return rv;
1267 }
1268
1269 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1270 {
1271         struct after_state_chg_work *ascw = container_of(w, struct after_state_chg_work, w);
1272
1273         after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1274         if ((ascw->flags & CS_WAIT_COMPLETE) != 0) {
1275                 D_ASSERT(ascw->done != NULL);
1276                 complete(ascw->done);
1277         }
1278
1279         kfree(ascw);    /* was kmalloc'ed in __drbd_set_state() */
1280         return 1;
1281 }
1282
1283 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1284 {
1285         unsigned int cstate;
1286
1287         if (rv != 0) {
1288                 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1289                 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1290                 return;
1291         }
1292
1293         /* no error reported: continue according to the current connection state */
1294         cstate = mdev->state.conn;
1295         if (cstate == C_STARTING_SYNC_T)
1296                 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1297         else if (cstate == C_STARTING_SYNC_S)
1298                 drbd_start_resync(mdev, C_SYNC_SOURCE);
1299 }
1300
1301 int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1302                                int (*io_fn)(struct drbd_conf *),
1303                                char *why, enum bm_flag flags)
1304 {
1305         int io_result;
1306
1307         D_ASSERT(current == mdev->worker.task);
1308
1309         /* non-blocking, open coded equivalent of drbd_suspend_io(mdev) */
1310         set_bit(SUSPEND_IO, &mdev->flags);
1311         drbd_bm_lock(mdev, why, flags);
1312
1313         io_result = io_fn(mdev);        /* caller supplied bitmap IO function */
1314
1315         /* release in reverse order: bitmap lock first, then resume IO */
1316         drbd_bm_unlock(mdev);
1317         drbd_resume_io(mdev);
1318         return io_result;
1319 }
1320
1321 /**
1322  * after_state_ch() - Perform after state change actions that may sleep
1323  * @mdev:       DRBD device.
1324  * @os:         old state.
1325  * @ns:         new state.
1326  * @flags:      Flags
1327  */
1328 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1329                            union drbd_state ns, enum chg_state_flags flags)
1330 {
1331         enum drbd_fencing_p fp;
1332         enum drbd_req_event what = nothing;
1333         union drbd_state nsm = (union drbd_state){ .i = -1 };
1334
1335         if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1336                 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1337                 if (mdev->p_uuid)
1338                         mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1339         }
1340
1341         fp = FP_DONT_CARE;
1342         if (get_ldev(mdev)) {
1343                 fp = mdev->ldev->dc.fencing;
1344                 put_ldev(mdev);
1345         }
1346
1347         /* Inform userspace about the change... */
1348         drbd_bcast_state(mdev, ns);
1349
1350         if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1351             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1352                 drbd_khelper(mdev, "pri-on-incon-degr");
1353
1354         /* Here we have the actions that are performed after a
1355            state change. This function might sleep */
1356
1357         nsm.i = -1;
1358         if (ns.susp_nod) {
1359                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1360                         what = resend;
1361
1362                 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1363                         what = restart_frozen_disk_io;
1364
1365                 if (what != nothing)
1366                         nsm.susp_nod = 0;
1367         }
1368
1369         if (ns.susp_fen) {
1370                 /* case1: The outdate peer handler is successful: */
1371                 if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1372                         tl_clear(mdev);
1373                         if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1374                                 drbd_uuid_new_current(mdev);
1375                                 clear_bit(NEW_CUR_UUID, &mdev->flags);
1376                         }
1377                         spin_lock_irq(&mdev->req_lock);
1378                         _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1379                         spin_unlock_irq(&mdev->req_lock);
1380                 }
1381                 /* case2: The connection was established again: */
1382                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1383                         clear_bit(NEW_CUR_UUID, &mdev->flags);
1384                         what = resend;
1385                         nsm.susp_fen = 0;
1386                 }
1387         }
1388
1389         if (what != nothing) {
1390                 spin_lock_irq(&mdev->req_lock);
1391                 _tl_restart(mdev, what);
1392                 nsm.i &= mdev->state.i;
1393                 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1394                 spin_unlock_irq(&mdev->req_lock);
1395         }
1396
1397         /* Became sync source.  With protocol >= 96, we still need to send out
1398          * the sync uuid now. Need to do that before any drbd_send_state, or
1399          * the other side may go "paused sync" before receiving the sync uuids,
1400          * which is unexpected. */
1401         if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1402             (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1403             mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1404                 drbd_gen_and_send_sync_uuid(mdev);
1405                 put_ldev(mdev);
1406         }
1407
1408         /* Do not change the order of the if above and the two below... */
1409         if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1410                 drbd_send_uuids(mdev);
1411                 drbd_send_state(mdev);
1412         }
1413         /* No point in queuing send_bitmap if we don't have a connection
1414          * anymore, so check also the _current_ state, not only the new state
1415          * at the time this work was queued. */
1416         if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1417             mdev->state.conn == C_WF_BITMAP_S)
1418                 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1419                                 "send_bitmap (WFBitMapS)",
1420                                 BM_LOCKED_TEST_ALLOWED);
1421
1422         /* Lost contact to peer's copy of the data */
1423         if ((os.pdsk >= D_INCONSISTENT &&
1424              os.pdsk != D_UNKNOWN &&
1425              os.pdsk != D_OUTDATED)
1426         &&  (ns.pdsk < D_INCONSISTENT ||
1427              ns.pdsk == D_UNKNOWN ||
1428              ns.pdsk == D_OUTDATED)) {
1429                 if (get_ldev(mdev)) {
1430                         if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1431                             mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1432                                 if (is_susp(mdev->state)) {
1433                                         set_bit(NEW_CUR_UUID, &mdev->flags);
1434                                 } else {
1435                                         drbd_uuid_new_current(mdev);
1436                                         drbd_send_uuids(mdev);
1437                                 }
1438                         }
1439                         put_ldev(mdev);
1440                 }
1441         }
1442
1443         if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1444                 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1445                         drbd_uuid_new_current(mdev);
1446                         drbd_send_uuids(mdev);
1447                 }
1448
1449                 /* D_DISKLESS Peer becomes secondary */
1450                 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1451                         /* We may still be Primary ourselves.
1452                          * No harm done if the bitmap still changes,
1453                          * redirtied pages will follow later. */
1454                         drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1455                                 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
1456                 put_ldev(mdev);
1457         }
1458
1459         /* Write out all changed bits on demote.
1460          * Though, no need to do that just yet
1461          * if there is a resync going on still */
1462         if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1463                 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1464                 /* No changes to the bitmap expected this time, so assert that,
1465                  * even though no harm was done if it did change. */
1466                 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1467                                 "demote", BM_LOCKED_TEST_ALLOWED);
1468                 put_ldev(mdev);
1469         }
1470
1471         /* Last part of the attaching process ... */
1472         if (ns.conn >= C_CONNECTED &&
1473             os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1474                 drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1475                 drbd_send_uuids(mdev);
1476                 drbd_send_state(mdev);
1477         }
1478
1479         /* We want to pause/continue resync, tell peer. */
1480         if (ns.conn >= C_CONNECTED &&
1481              ((os.aftr_isp != ns.aftr_isp) ||
1482               (os.user_isp != ns.user_isp)))
1483                 drbd_send_state(mdev);
1484
1485         /* In case one of the isp bits got set, suspend other devices. */
1486         if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1487             (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1488                 suspend_other_sg(mdev);
1489
1490         /* Make sure the peer gets informed about eventual state
1491            changes (ISP bits) while we were in WFReportParams. */
1492         if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1493                 drbd_send_state(mdev);
1494
1495         if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1496                 drbd_send_state(mdev);
1497
1498         /* We are in the progress to start a full sync... */
1499         if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1500             (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1501                 /* no other bitmap changes expected during this phase */
1502                 drbd_queue_bitmap_io(mdev,
1503                         &drbd_bmio_set_n_write, &abw_start_sync,
1504                         "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1505
1506         /* We are invalidating our self... */
1507         if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1508             os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1509                 /* other bitmap operation expected during this phase */
1510                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1511                         "set_n_write from invalidate", BM_LOCKED_MASK);
1512
1513         /* first half of local IO error, failure to attach,
1514          * or administrative detach */
1515         if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1516                 enum drbd_io_error_p eh;
1517                 int was_io_error;
1518                 /* corresponding get_ldev was in __drbd_set_state, to serialize
1519                  * our cleanup here with the transition to D_DISKLESS,
1520                  * so it is safe to dereference ldev here. */
1521                 eh = mdev->ldev->dc.on_io_error;
1522                 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1523
1524                 /* current state still has to be D_FAILED,
1525                  * there is only one way out: to D_DISKLESS,
1526                  * and that may only happen after our put_ldev below. */
1527                 if (mdev->state.disk != D_FAILED)
1528                         dev_err(DEV,
1529                                 "ASSERT FAILED: disk is %s during detach\n",
1530                                 drbd_disk_str(mdev->state.disk));
1531
1532                 if (drbd_send_state(mdev))
1533                         dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1534                 else
1535                         dev_err(DEV, "Sending state for detaching disk failed\n");
1536
1537                 drbd_rs_cancel_all(mdev);
1538
1539                 /* In case we want to get something to stable storage still,
1540                  * this may be the last chance.
1541                  * Following put_ldev may transition to D_DISKLESS. */
1542                 drbd_md_sync(mdev);
1543                 put_ldev(mdev);
1544
1545                 if (was_io_error && eh == EP_CALL_HELPER)
1546                         drbd_khelper(mdev, "local-io-error");
1547         }
1548
1549         /* second half of local IO error, failure to attach,
1550          * or administrative detach,
1551          * after local_cnt references have reached zero again */
1552         if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1553                 /* We must still be diskless,
1554                  * re-attach has to be serialized with this! */
1555                 if (mdev->state.disk != D_DISKLESS)
1556                         dev_err(DEV,
1557                                 "ASSERT FAILED: disk is %s while going diskless\n",
1558                                 drbd_disk_str(mdev->state.disk));
1559
1560                 mdev->rs_total = 0;
1561                 mdev->rs_failed = 0;
1562                 atomic_set(&mdev->rs_pending_cnt, 0);
1563
1564                 if (drbd_send_state(mdev))
1565                         dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1566                 /* corresponding get_ldev in __drbd_set_state
1567                  * this may finally trigger drbd_ldev_destroy. */
1568                 put_ldev(mdev);
1569         }
1570
1571         /* Notify peer that I had a local IO error, and did not detached.. */
1572         if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1573                 drbd_send_state(mdev);
1574
1575         /* Disks got bigger while they were detached */
1576         if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1577             test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1578                 if (ns.conn == C_CONNECTED)
1579                         resync_after_online_grow(mdev);
1580         }
1581
1582         /* A resync finished or aborted, wake paused devices... */
1583         if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1584             (os.peer_isp && !ns.peer_isp) ||
1585             (os.user_isp && !ns.user_isp))
1586                 resume_next_sg(mdev);
1587
1588         /* sync target done with resync.  Explicitly notify peer, even though
1589          * it should (at least for non-empty resyncs) already know itself. */
1590         if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1591                 drbd_send_state(mdev);
1592
1593         /* This triggers bitmap writeout of potentially still unwritten pages
1594          * if the resync finished cleanly, or aborted because of peer disk
1595          * failure, or because of connection loss.
1596          * For resync aborted because of local disk failure, we cannot do
1597          * any bitmap writeout anymore.
1598          * No harm done if some bits change during this phase.
1599          */
1600         if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1601                 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1602                         "write from resync_finished", BM_LOCKED_SET_ALLOWED);
1603                 put_ldev(mdev);
1604         }
1605
1606         /* free tl_hash if we Got thawed and are C_STANDALONE */
1607         if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1608                 drbd_free_tl_hash(mdev);
1609
1610         /* Upon network connection, we need to start the receiver */
1611         if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1612                 drbd_thread_start(&mdev->receiver);
1613
1614         /* Terminate worker thread if we are unconfigured - it will be
1615            restarted as needed... */
1616         if (ns.disk == D_DISKLESS &&
1617             ns.conn == C_STANDALONE &&
1618             ns.role == R_SECONDARY) {
1619                 if (os.aftr_isp != ns.aftr_isp)
1620                         resume_next_sg(mdev);
1621                 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1622                 if (test_bit(DEVICE_DYING, &mdev->flags))
1623                         drbd_thread_stop_nowait(&mdev->worker);
1624         }
1625
1626         drbd_md_sync(mdev);
1627 }
1628
1629
1630 static int drbd_thread_setup(void *arg)
1631 {
1632         struct drbd_thread *thi = (struct drbd_thread *) arg;
1633         struct drbd_conf *mdev = thi->mdev;
1634         unsigned long flags;
1635         int retval;
1636
1637 restart:
1638         retval = thi->function(thi);
1639
1640         spin_lock_irqsave(&thi->t_lock, flags);
1641
1642         /* if the receiver has been "Exiting", the last thing it did
1643          * was set the conn state to "StandAlone",
1644          * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1645          * and receiver thread will be "started".
1646          * drbd_thread_start needs to set "Restarting" in that case.
1647          * t_state check and assignment needs to be within the same spinlock,
1648          * so either thread_start sees Exiting, and can remap to Restarting,
1649          * or thread_start see None, and can proceed as normal.
1650          */
1651
1652         if (thi->t_state == Restarting) {
1653                 dev_info(DEV, "Restarting %s\n", current->comm);
1654                 thi->t_state = Running;
1655                 spin_unlock_irqrestore(&thi->t_lock, flags);
1656                 goto restart;
1657         }
1658
1659         thi->task = NULL;
1660         thi->t_state = None;
1661         smp_mb();
1662         complete(&thi->stop);
1663         spin_unlock_irqrestore(&thi->t_lock, flags);
1664
1665         dev_info(DEV, "Terminating %s\n", current->comm);
1666
1667         /* Release mod reference taken when thread was started */
1668         module_put(THIS_MODULE);
1669         return retval;
1670 }
1671
1672 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1673                       int (*func) (struct drbd_thread *))
1674 {
1675         spin_lock_init(&thi->t_lock);
1676         thi->task    = NULL;
1677         thi->t_state = None;
1678         thi->function = func;
1679         thi->mdev = mdev;
1680 }
1681
1682 int drbd_thread_start(struct drbd_thread *thi)
1683 {
1684         struct drbd_conf *mdev = thi->mdev;
1685         struct task_struct *nt;
1686         unsigned long flags;
1687
1688         const char *me =
1689                 thi == &mdev->receiver ? "receiver" :
1690                 thi == &mdev->asender  ? "asender"  :
1691                 thi == &mdev->worker   ? "worker"   : "NONSENSE";
1692
1693         /* is used from state engine doing drbd_thread_stop_nowait,
1694          * while holding the req lock irqsave */
1695         spin_lock_irqsave(&thi->t_lock, flags);
1696
1697         switch (thi->t_state) {
1698         case None:
1699                 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1700                                 me, current->comm, current->pid);
1701
1702                 /* Get ref on module for thread - this is released when thread exits */
1703                 if (!try_module_get(THIS_MODULE)) {
1704                         dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1705                         spin_unlock_irqrestore(&thi->t_lock, flags);
1706                         return false;
1707                 }
1708
1709                 init_completion(&thi->stop);
1710                 D_ASSERT(thi->task == NULL);
1711                 thi->reset_cpu_mask = 1;
1712                 thi->t_state = Running;
1713                 spin_unlock_irqrestore(&thi->t_lock, flags);
1714                 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1715
1716                 nt = kthread_create(drbd_thread_setup, (void *) thi,
1717                                     "drbd%d_%s", mdev_to_minor(mdev), me);
1718
1719                 if (IS_ERR(nt)) {
1720                         dev_err(DEV, "Couldn't start thread\n");
1721
1722                         module_put(THIS_MODULE);
1723                         return false;
1724                 }
1725                 spin_lock_irqsave(&thi->t_lock, flags);
1726                 thi->task = nt;
1727                 thi->t_state = Running;
1728                 spin_unlock_irqrestore(&thi->t_lock, flags);
1729                 wake_up_process(nt);
1730                 break;
1731         case Exiting:
1732                 thi->t_state = Restarting;
1733                 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1734                                 me, current->comm, current->pid);
1735                 /* fall through */
1736         case Running:
1737         case Restarting:
1738         default:
1739                 spin_unlock_irqrestore(&thi->t_lock, flags);
1740                 break;
1741         }
1742
1743         return true;
1744 }
1745
1746
1747 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1748 {
1749         unsigned long flags;
1750
1751         enum drbd_thread_state ns = restart ? Restarting : Exiting;
1752
1753         /* may be called from state engine, holding the req lock irqsave */
1754         spin_lock_irqsave(&thi->t_lock, flags);
1755
1756         if (thi->t_state == None) {
1757                 spin_unlock_irqrestore(&thi->t_lock, flags);
1758                 if (restart)
1759                         drbd_thread_start(thi);
1760                 return;
1761         }
1762
1763         if (thi->t_state != ns) {
1764                 if (thi->task == NULL) {
1765                         spin_unlock_irqrestore(&thi->t_lock, flags);
1766                         return;
1767                 }
1768
1769                 thi->t_state = ns;
1770                 smp_mb();
1771                 init_completion(&thi->stop);
1772                 if (thi->task != current)
1773                         force_sig(DRBD_SIGKILL, thi->task);
1774
1775         }
1776
1777         spin_unlock_irqrestore(&thi->t_lock, flags);
1778
1779         if (wait)
1780                 wait_for_completion(&thi->stop);
1781 }
1782
1783 #ifdef CONFIG_SMP
1784 /**
1785  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1786  * @mdev:       DRBD device.
1787  *
1788  * Forces all threads of a device onto the same CPU. This is beneficial for
1789  * DRBD's performance. May be overwritten by user's configuration.
1790  */
1791 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1792 {
1793         int ord, cpu;
1794
1795         /* user override. */
1796         if (cpumask_weight(mdev->cpu_mask))
1797                 return;
1798
1799         ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1800         for_each_online_cpu(cpu) {
1801                 if (ord-- == 0) {
1802                         cpumask_set_cpu(cpu, mdev->cpu_mask);
1803                         return;
1804                 }
1805         }
1806         /* should not be reached */
1807         cpumask_setall(mdev->cpu_mask);
1808 }
1809
1810 /**
1811  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1812  * @mdev:       DRBD device.
1813  *
1814  * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1815  * prematurely.
1816  */
1817 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1818 {
1819         struct task_struct *p = current;
1820         struct drbd_thread *thi =
1821                 p == mdev->asender.task  ? &mdev->asender  :
1822                 p == mdev->receiver.task ? &mdev->receiver :
1823                 p == mdev->worker.task   ? &mdev->worker   :
1824                 NULL;
1825         ERR_IF(thi == NULL)
1826                 return;
1827         if (!thi->reset_cpu_mask)
1828                 return;
1829         thi->reset_cpu_mask = 0;
1830         set_cpus_allowed_ptr(p, mdev->cpu_mask);
1831 }
1832 #endif
1833
1834 /* the appropriate socket mutex must be held already */
1835 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1836                           enum drbd_packets cmd, struct p_header80 *h,
1837                           size_t size, unsigned msg_flags)
1838 {
1839         int sent, ok;
1840
1841         ERR_IF(!h) return false;
1842         ERR_IF(!size) return false;
1843
1844         h->magic   = BE_DRBD_MAGIC;
1845         h->command = cpu_to_be16(cmd);
1846         h->length  = cpu_to_be16(size-sizeof(struct p_header80));
1847
1848         sent = drbd_send(mdev, sock, h, size, msg_flags);
1849
1850         ok = (sent == size);
1851         if (!ok && !signal_pending(current))
1852                 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
1853                     cmdname(cmd), (int)size, sent);
1854         return ok;
1855 }
1856
1857 /* don't pass the socket. we may only look at it
1858  * when we hold the appropriate socket mutex.
1859  */
1860 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1861                   enum drbd_packets cmd, struct p_header80 *h, size_t size)
1862 {
1863         int ok = 0;
1864         struct socket *sock;
1865
1866         if (use_data_socket) {
1867                 mutex_lock(&mdev->data.mutex);
1868                 sock = mdev->data.socket;
1869         } else {
1870                 mutex_lock(&mdev->meta.mutex);
1871                 sock = mdev->meta.socket;
1872         }
1873
1874         /* drbd_disconnect() could have called drbd_free_sock()
1875          * while we were waiting in down()... */
1876         if (likely(sock != NULL))
1877                 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1878
1879         if (use_data_socket)
1880                 mutex_unlock(&mdev->data.mutex);
1881         else
1882                 mutex_unlock(&mdev->meta.mutex);
1883         return ok;
1884 }
1885
1886 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1887                    size_t size)
1888 {
1889         struct p_header80 h;
1890         int ok;
1891
1892         h.magic   = BE_DRBD_MAGIC;
1893         h.command = cpu_to_be16(cmd);
1894         h.length  = cpu_to_be16(size);
1895
1896         if (!drbd_get_data_sock(mdev))
1897                 return 0;
1898
1899         ok = (sizeof(h) ==
1900                 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1901         ok = ok && (size ==
1902                 drbd_send(mdev, mdev->data.socket, data, size, 0));
1903
1904         drbd_put_data_sock(mdev);
1905
1906         return ok;
1907 }
1908
1909 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1910 {
1911         struct p_rs_param_95 *p;
1912         struct socket *sock;
1913         int size, rv;
1914         const int apv = mdev->agreed_pro_version;
1915
1916         size = apv <= 87 ? sizeof(struct p_rs_param)
1917                 : apv == 88 ? sizeof(struct p_rs_param)
1918                         + strlen(mdev->sync_conf.verify_alg) + 1
1919                 : apv <= 94 ? sizeof(struct p_rs_param_89)
1920                 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1921
1922         /* used from admin command context and receiver/worker context.
1923          * to avoid kmalloc, grab the socket right here,
1924          * then use the pre-allocated sbuf there */
1925         mutex_lock(&mdev->data.mutex);
1926         sock = mdev->data.socket;
1927
1928         if (likely(sock != NULL)) {
1929                 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1930
1931                 p = &mdev->data.sbuf.rs_param_95;
1932
1933                 /* initialize verify_alg and csums_alg */
1934                 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1935
1936                 p->rate = cpu_to_be32(sc->rate);
1937                 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1938                 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1939                 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1940                 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1941
1942                 if (apv >= 88)
1943                         strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1944                 if (apv >= 89)
1945                         strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1946
1947                 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1948         } else
1949                 rv = 0; /* not ok */
1950
1951         mutex_unlock(&mdev->data.mutex);
1952
1953         return rv;
1954 }
1955
1956 int drbd_send_protocol(struct drbd_conf *mdev)
1957 {
1958         struct p_protocol *p;
1959         int size, cf, rv;
1960
1961         size = sizeof(struct p_protocol);
1962
1963         if (mdev->agreed_pro_version >= 87)
1964                 size += strlen(mdev->net_conf->integrity_alg) + 1;
1965
1966         /* we must not recurse into our own queue,
1967          * as that is blocked during handshake */
1968         p = kmalloc(size, GFP_NOIO);
1969         if (p == NULL)
1970                 return 0;
1971
1972         p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1973         p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1974         p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1975         p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1976         p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1977
1978         cf = 0;
1979         if (mdev->net_conf->want_lose)
1980                 cf |= CF_WANT_LOSE;
1981         if (mdev->net_conf->dry_run) {
1982                 if (mdev->agreed_pro_version >= 92)
1983                         cf |= CF_DRY_RUN;
1984                 else {
1985                         dev_err(DEV, "--dry-run is not supported by peer");
1986                         kfree(p);
1987                         return -1;
1988                 }
1989         }
1990         p->conn_flags    = cpu_to_be32(cf);
1991
1992         if (mdev->agreed_pro_version >= 87)
1993                 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1994
1995         rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1996                            (struct p_header80 *)p, size);
1997         kfree(p);
1998         return rv;
1999 }
2000
2001 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2002 {
2003         struct p_uuids p;
2004         int i;
2005
2006         if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2007                 return 1;
2008
2009         for (i = UI_CURRENT; i < UI_SIZE; i++)
2010                 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2011
2012         mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2013         p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2014         uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2015         uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2016         uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2017         p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2018
2019         put_ldev(mdev);
2020
2021         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
2022                              (struct p_header80 *)&p, sizeof(p));
2023 }
2024
2025 int drbd_send_uuids(struct drbd_conf *mdev)
2026 {
2027         return _drbd_send_uuids(mdev, 0);
2028 }
2029
2030 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2031 {
2032         return _drbd_send_uuids(mdev, 8);
2033 }
2034
2035 void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2036 {
2037         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2038                 u64 *uuid = mdev->ldev->md.uuid;
2039                 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2040                      text,
2041                      (unsigned long long)uuid[UI_CURRENT],
2042                      (unsigned long long)uuid[UI_BITMAP],
2043                      (unsigned long long)uuid[UI_HISTORY_START],
2044                      (unsigned long long)uuid[UI_HISTORY_END]);
2045                 put_ldev(mdev);
2046         } else {
2047                 dev_info(DEV, "%s effective data uuid: %016llX\n",
2048                                 text,
2049                                 (unsigned long long)mdev->ed_uuid);
2050         }
2051 }
2052
2053 int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
2054 {
2055         struct p_rs_uuid p;
2056         u64 uuid;
2057
2058         D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2059
2060         uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
2061         drbd_uuid_set(mdev, UI_BITMAP, uuid);
2062         drbd_print_uuids(mdev, "updated sync UUID");
2063         drbd_md_sync(mdev);
2064         p.uuid = cpu_to_be64(uuid);
2065
2066         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
2067                              (struct p_header80 *)&p, sizeof(p));
2068 }
2069
2070 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
2071 {
2072         struct p_sizes p;
2073         sector_t d_size, u_size;
2074         int q_order_type, max_bio_size;
2075         int ok;
2076
2077         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2078                 D_ASSERT(mdev->ldev->backing_bdev);
2079                 d_size = drbd_get_max_capacity(mdev->ldev);
2080                 u_size = mdev->ldev->dc.disk_size;
2081                 q_order_type = drbd_queue_order_type(mdev);
2082                 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2083                 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
2084                 put_ldev(mdev);
2085         } else {
2086                 d_size = 0;
2087                 u_size = 0;
2088                 q_order_type = QUEUE_ORDERED_NONE;
2089                 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
2090         }
2091
2092         p.d_size = cpu_to_be64(d_size);
2093         p.u_size = cpu_to_be64(u_size);
2094         p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2095         p.max_bio_size = cpu_to_be32(max_bio_size);
2096         p.queue_order_type = cpu_to_be16(q_order_type);
2097         p.dds_flags = cpu_to_be16(flags);
2098
2099         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
2100                            (struct p_header80 *)&p, sizeof(p));
2101         return ok;
2102 }
2103
2104 /**
2105  * drbd_send_state() - Sends the drbd state to the peer
2106  * @mdev:       DRBD device.
2107  */
2108 int drbd_send_state(struct drbd_conf *mdev)
2109 {
2110         struct socket *sock;
2111         struct p_state p;
2112         int ok = 0;
2113
2114         /* Grab state lock so we wont send state if we're in the middle
2115          * of a cluster wide state change on another thread */
2116         drbd_state_lock(mdev);
2117
2118         mutex_lock(&mdev->data.mutex);
2119
2120         p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2121         sock = mdev->data.socket;
2122
2123         if (likely(sock != NULL)) {
2124                 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2125                                     (struct p_header80 *)&p, sizeof(p), 0);
2126         }
2127
2128         mutex_unlock(&mdev->data.mutex);
2129
2130         drbd_state_unlock(mdev);
2131         return ok;
2132 }
2133
2134 int drbd_send_state_req(struct drbd_conf *mdev,
2135         union drbd_state mask, union drbd_state val)
2136 {
2137         struct p_req_state p;
2138
2139         p.mask    = cpu_to_be32(mask.i);
2140         p.val     = cpu_to_be32(val.i);
2141
2142         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2143                              (struct p_header80 *)&p, sizeof(p));
2144 }
2145
2146 int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2147 {
2148         struct p_req_state_reply p;
2149
2150         p.retcode    = cpu_to_be32(retcode);
2151
2152         return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2153                              (struct p_header80 *)&p, sizeof(p));
2154 }
2155
2156 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2157         struct p_compressed_bm *p,
2158         struct bm_xfer_ctx *c)
2159 {
2160         struct bitstream bs;
2161         unsigned long plain_bits;
2162         unsigned long tmp;
2163         unsigned long rl;
2164         unsigned len;
2165         unsigned toggle;
2166         int bits;
2167
2168         /* may we use this feature? */
2169         if ((mdev->sync_conf.use_rle == 0) ||
2170                 (mdev->agreed_pro_version < 90))
2171                         return 0;
2172
2173         if (c->bit_offset >= c->bm_bits)
2174                 return 0; /* nothing to do. */
2175
2176         /* use at most thus many bytes */
2177         bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2178         memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2179         /* plain bits covered in this code string */
2180         plain_bits = 0;
2181
2182         /* p->encoding & 0x80 stores whether the first run length is set.
2183          * bit offset is implicit.
2184          * start with toggle == 2 to be able to tell the first iteration */
2185         toggle = 2;
2186
2187         /* see how much plain bits we can stuff into one packet
2188          * using RLE and VLI. */
2189         do {
2190                 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2191                                     : _drbd_bm_find_next(mdev, c->bit_offset);
2192                 if (tmp == -1UL)
2193                         tmp = c->bm_bits;
2194                 rl = tmp - c->bit_offset;
2195
2196                 if (toggle == 2) { /* first iteration */
2197                         if (rl == 0) {
2198                                 /* the first checked bit was set,
2199                                  * store start value, */
2200                                 DCBP_set_start(p, 1);
2201                                 /* but skip encoding of zero run length */
2202                                 toggle = !toggle;
2203                                 continue;
2204                         }
2205                         DCBP_set_start(p, 0);
2206                 }
2207
2208                 /* paranoia: catch zero runlength.
2209                  * can only happen if bitmap is modified while we scan it. */
2210                 if (rl == 0) {
2211                         dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2212                             "t:%u bo:%lu\n", toggle, c->bit_offset);
2213                         return -1;
2214                 }
2215
2216                 bits = vli_encode_bits(&bs, rl);
2217                 if (bits == -ENOBUFS) /* buffer full */
2218                         break;
2219                 if (bits <= 0) {
2220                         dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2221                         return 0;
2222                 }
2223
2224                 toggle = !toggle;
2225                 plain_bits += rl;
2226                 c->bit_offset = tmp;
2227         } while (c->bit_offset < c->bm_bits);
2228
2229         len = bs.cur.b - p->code + !!bs.cur.bit;
2230
2231         if (plain_bits < (len << 3)) {
2232                 /* incompressible with this method.
2233                  * we need to rewind both word and bit position. */
2234                 c->bit_offset -= plain_bits;
2235                 bm_xfer_ctx_bit_to_word_offset(c);
2236                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2237                 return 0;
2238         }
2239
2240         /* RLE + VLI was able to compress it just fine.
2241          * update c->word_offset. */
2242         bm_xfer_ctx_bit_to_word_offset(c);
2243
2244         /* store pad_bits */
2245         DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2246
2247         return len;
2248 }
2249
2250 /**
2251  * send_bitmap_rle_or_plain
2252  *
2253  * Return 0 when done, 1 when another iteration is needed, and a negative error
2254  * code upon failure.
2255  */
2256 static int
2257 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2258                          struct p_header80 *h, struct bm_xfer_ctx *c)
2259 {
2260         struct p_compressed_bm *p = (void*)h;
2261         unsigned long num_words;
2262         int len;
2263         int ok;
2264
2265         len = fill_bitmap_rle_bits(mdev, p, c);
2266
2267         if (len < 0)
2268                 return -EIO;
2269
2270         if (len) {
2271                 DCBP_set_code(p, RLE_VLI_Bits);
2272                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2273                         sizeof(*p) + len, 0);
2274
2275                 c->packets[0]++;
2276                 c->bytes[0] += sizeof(*p) + len;
2277
2278                 if (c->bit_offset >= c->bm_bits)
2279                         len = 0; /* DONE */
2280         } else {
2281                 /* was not compressible.
2282                  * send a buffer full of plain text bits instead. */
2283                 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2284                 len = num_words * sizeof(long);
2285                 if (len)
2286                         drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2287                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2288                                    h, sizeof(struct p_header80) + len, 0);
2289                 c->word_offset += num_words;
2290                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2291
2292                 c->packets[1]++;
2293                 c->bytes[1] += sizeof(struct p_header80) + len;
2294
2295                 if (c->bit_offset > c->bm_bits)
2296                         c->bit_offset = c->bm_bits;
2297         }
2298         if (ok) {
2299                 if (len == 0) {
2300                         INFO_bm_xfer_stats(mdev, "send", c);
2301                         return 0;
2302                 } else
2303                         return 1;
2304         }
2305         return -EIO;
2306 }
2307
2308 /* See the comment at receive_bitmap() */
2309 int _drbd_send_bitmap(struct drbd_conf *mdev)
2310 {
2311         struct bm_xfer_ctx c;
2312         struct p_header80 *p;
2313         int err;
2314
2315         ERR_IF(!mdev->bitmap) return false;
2316
2317         /* maybe we should use some per thread scratch page,
2318          * and allocate that during initial device creation? */
2319         p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2320         if (!p) {
2321                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2322                 return false;
2323         }
2324
2325         if (get_ldev(mdev)) {
2326                 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2327                         dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2328                         drbd_bm_set_all(mdev);
2329                         if (drbd_bm_write(mdev)) {
2330                                 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2331                                  * but otherwise process as per normal - need to tell other
2332                                  * side that a full resync is required! */
2333                                 dev_err(DEV, "Failed to write bitmap to disk!\n");
2334                         } else {
2335                                 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2336                                 drbd_md_sync(mdev);
2337                         }
2338                 }
2339                 put_ldev(mdev);
2340         }
2341
2342         c = (struct bm_xfer_ctx) {
2343                 .bm_bits = drbd_bm_bits(mdev),
2344                 .bm_words = drbd_bm_words(mdev),
2345         };
2346
2347         do {
2348                 err = send_bitmap_rle_or_plain(mdev, p, &c);
2349         } while (err > 0);
2350
2351         free_page((unsigned long) p);
2352         return err == 0;
2353 }
2354
/*
 * Take the data socket, send the bitmap via _drbd_send_bitmap(), and release
 * the socket again.
 *
 * Returns -1 if the data socket could not be obtained, 0 if the bitmap was
 * sent, and 1 if sending it failed.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int failed;

	if (!drbd_get_data_sock(mdev))
		return -1;

	failed = _drbd_send_bitmap(mdev) ? 0 : 1;
	drbd_put_data_sock(mdev);

	return failed;
}
2365
2366 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2367 {
2368         int ok;
2369         struct p_barrier_ack p;
2370
2371         p.barrier  = barrier_nr;
2372         p.set_size = cpu_to_be32(set_size);
2373
2374         if (mdev->state.conn < C_CONNECTED)
2375                 return false;
2376         ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2377                         (struct p_header80 *)&p, sizeof(p));
2378         return ok;
2379 }
2380
2381 /**
2382  * _drbd_send_ack() - Sends an ack packet
2383  * @mdev:       DRBD device.
2384  * @cmd:        Packet command code.
2385  * @sector:     sector, needs to be in big endian byte order
2386  * @blksize:    size in byte, needs to be in big endian byte order
2387  * @block_id:   Id, big endian byte order
2388  */
2389 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2390                           u64 sector,
2391                           u32 blksize,
2392                           u64 block_id)
2393 {
2394         int ok;
2395         struct p_block_ack p;
2396
2397         p.sector   = sector;
2398         p.block_id = block_id;
2399         p.blksize  = blksize;
2400         p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2401
2402         if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2403                 return false;
2404         ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2405                                 (struct p_header80 *)&p, sizeof(p));
2406         return ok;
2407 }
2408
2409 /* dp->sector and dp->block_id already/still in network byte order,
2410  * data_size is payload size according to dp->head,
2411  * and may need to be corrected for digest size. */
2412 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2413                      struct p_data *dp, int data_size)
2414 {
2415         data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2416                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2417         return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2418                               dp->block_id);
2419 }
2420
2421 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2422                      struct p_block_req *rp)
2423 {
2424         return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2425 }
2426
2427 /**
2428  * drbd_send_ack() - Sends an ack packet
2429  * @mdev:       DRBD device.
2430  * @cmd:        Packet command code.
2431  * @e:          Epoch entry.
2432  */
2433 int drbd_send_ack(struct drbd_conf *mdev,
2434         enum drbd_packets cmd, struct drbd_epoch_entry *e)
2435 {
2436         return _drbd_send_ack(mdev, cmd,
2437                               cpu_to_be64(e->sector),
2438                               cpu_to_be32(e->size),
2439                               e->block_id);
2440 }
2441
2442 /* This function misuses the block_id field to signal if the blocks
2443  * are is sync or not. */
2444 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2445                      sector_t sector, int blksize, u64 block_id)
2446 {
2447         return _drbd_send_ack(mdev, cmd,
2448                               cpu_to_be64(sector),
2449                               cpu_to_be32(blksize),
2450                               cpu_to_be64(block_id));
2451 }
2452
2453 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2454                        sector_t sector, int size, u64 block_id)
2455 {
2456         int ok;
2457         struct p_block_req p;
2458
2459         p.sector   = cpu_to_be64(sector);
2460         p.block_id = block_id;
2461         p.blksize  = cpu_to_be32(size);
2462
2463         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2464                                 (struct p_header80 *)&p, sizeof(p));
2465         return ok;
2466 }
2467
2468 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2469                             sector_t sector, int size,
2470                             void *digest, int digest_size,
2471                             enum drbd_packets cmd)
2472 {
2473         int ok;
2474         struct p_block_req p;
2475
2476         p.sector   = cpu_to_be64(sector);
2477         p.block_id = BE_DRBD_MAGIC + 0xbeef;
2478         p.blksize  = cpu_to_be32(size);
2479
2480         p.head.magic   = BE_DRBD_MAGIC;
2481         p.head.command = cpu_to_be16(cmd);
2482         p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2483
2484         mutex_lock(&mdev->data.mutex);
2485
2486         ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2487         ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2488
2489         mutex_unlock(&mdev->data.mutex);
2490
2491         return ok;
2492 }
2493
2494 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2495 {
2496         int ok;
2497         struct p_block_req p;
2498
2499         p.sector   = cpu_to_be64(sector);
2500         p.block_id = BE_DRBD_MAGIC + 0xbabe;
2501         p.blksize  = cpu_to_be32(size);
2502
2503         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2504                            (struct p_header80 *)&p, sizeof(p));
2505         return ok;
2506 }
2507
2508 /* called on sndtimeo
2509  * returns false if we should retry,
2510  * true if we think connection is dead
2511  */
2512 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2513 {
2514         int drop_it;
2515         /* long elapsed = (long)(jiffies - mdev->last_received); */
2516
2517         drop_it =   mdev->meta.socket == sock
2518                 || !mdev->asender.task
2519                 || get_t_state(&mdev->asender) != Running
2520                 || mdev->state.conn < C_CONNECTED;
2521
2522         if (drop_it)
2523                 return true;
2524
2525         drop_it = !--mdev->ko_count;
2526         if (!drop_it) {
2527                 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2528                        current->comm, current->pid, mdev->ko_count);
2529                 request_ping(mdev);
2530         }
2531
2532         return drop_it; /* && (mdev->state == R_PRIMARY) */;
2533 }
2534
2535 /* The idea of sendpage seems to be to put some kind of reference
2536  * to the page into the skb, and to hand it over to the NIC. In
2537  * this process get_page() gets called.
2538  *
2539  * As soon as the page was really sent over the network put_page()
2540  * gets called by some part of the network layer. [ NIC driver? ]
2541  *
2542  * [ get_page() / put_page() increment/decrement the count. If count
2543  *   reaches 0 the page will be freed. ]
2544  *
2545  * This works nicely with pages from FSs.
2546  * But this means that in protocol A we might signal IO completion too early!
2547  *
2548  * In order not to corrupt data during a resync we must make sure
2549  * that we do not reuse our own buffer pages (EEs) to early, therefore
2550  * we have the net_ee list.
2551  *
2552  * XFS seems to have problems, still, it submits pages with page_count == 0!
2553  * As a workaround, we disable sendpage on pages
2554  * with page_count == 0 or PageSlab.
2555  */
2556 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2557                    int offset, size_t size, unsigned msg_flags)
2558 {
2559         int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2560         kunmap(page);
2561         if (sent == size)
2562                 mdev->send_cnt += size>>9;
2563         return sent == size;
2564 }
2565
2566 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2567                     int offset, size_t size, unsigned msg_flags)
2568 {
2569         mm_segment_t oldfs = get_fs();
2570         int sent, ok;
2571         int len = size;
2572
2573         /* e.g. XFS meta- & log-data is in slab pages, which have a
2574          * page_count of 0 and/or have PageSlab() set.
2575          * we cannot use send_page for those, as that does get_page();
2576          * put_page(); and would cause either a VM_BUG directly, or
2577          * __page_cache_release a page that would actually still be referenced
2578          * by someone, leading to some obscure delayed Oops somewhere else. */
2579         if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2580                 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2581
2582         msg_flags |= MSG_NOSIGNAL;
2583         drbd_update_congested(mdev);
2584         set_fs(KERNEL_DS);
2585         do {
2586                 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2587                                                         offset, len,
2588                                                         msg_flags);
2589                 if (sent == -EAGAIN) {
2590                         if (we_should_drop_the_connection(mdev,
2591                                                           mdev->data.socket))
2592                                 break;
2593                         else
2594                                 continue;
2595                 }
2596                 if (sent <= 0) {
2597                         dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2598                              __func__, (int)size, len, sent);
2599                         break;
2600                 }
2601                 len    -= sent;
2602                 offset += sent;
2603         } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2604         set_fs(oldfs);
2605         clear_bit(NET_CONGESTED, &mdev->flags);
2606
2607         ok = (len == 0);
2608         if (likely(ok))
2609                 mdev->send_cnt += size>>9;
2610         return ok;
2611 }
2612
2613 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2614 {
2615         struct bio_vec *bvec;
2616         int i;
2617         /* hint all but last page with MSG_MORE */
2618         __bio_for_each_segment(bvec, bio, i, 0) {
2619                 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2620                                      bvec->bv_offset, bvec->bv_len,
2621                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2622                         return 0;
2623         }
2624         return 1;
2625 }
2626
2627 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2628 {
2629         struct bio_vec *bvec;
2630         int i;
2631         /* hint all but last page with MSG_MORE */
2632         __bio_for_each_segment(bvec, bio, i, 0) {
2633                 if (!_drbd_send_page(mdev, bvec->bv_page,
2634                                      bvec->bv_offset, bvec->bv_len,
2635                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2636                         return 0;
2637         }
2638         return 1;
2639 }
2640
2641 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2642 {
2643         struct page *page = e->pages;
2644         unsigned len = e->size;
2645         /* hint all but last page with MSG_MORE */
2646         page_chain_for_each(page) {
2647                 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2648                 if (!_drbd_send_page(mdev, page, 0, l,
2649                                 page_chain_next(page) ? MSG_MORE : 0))
2650                         return 0;
2651                 len -= l;
2652         }
2653         return 1;
2654 }
2655
2656 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2657 {
2658         if (mdev->agreed_pro_version >= 95)
2659                 return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2660                         (bi_rw & REQ_FUA ? DP_FUA : 0) |
2661                         (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2662                         (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2663         else
2664                 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2665 }
2666
2667 /* Used to send write requests
2668  * R_PRIMARY -> Peer    (P_DATA)
2669  */
2670 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2671 {
2672         int ok = 1;
2673         struct p_data p;
2674         unsigned int dp_flags = 0;
2675         void *dgb;
2676         int dgs;
2677
2678         if (!drbd_get_data_sock(mdev))
2679                 return 0;
2680
2681         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2682                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2683
2684         if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2685                 p.head.h80.magic   = BE_DRBD_MAGIC;
2686                 p.head.h80.command = cpu_to_be16(P_DATA);
2687                 p.head.h80.length  =
2688                         cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2689         } else {
2690                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2691                 p.head.h95.command = cpu_to_be16(P_DATA);
2692                 p.head.h95.length  =
2693                         cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2694         }
2695
2696         p.sector   = cpu_to_be64(req->sector);
2697         p.block_id = (unsigned long)req;
2698         p.seq_num  = cpu_to_be32(req->seq_num =
2699                                  atomic_add_return(1, &mdev->packet_seq));
2700
2701         dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2702
2703         if (mdev->state.conn >= C_SYNC_SOURCE &&
2704             mdev->state.conn <= C_PAUSED_SYNC_T)
2705                 dp_flags |= DP_MAY_SET_IN_SYNC;
2706
2707         p.dp_flags = cpu_to_be32(dp_flags);
2708         set_bit(UNPLUG_REMOTE, &mdev->flags);
2709         ok = (sizeof(p) ==
2710                 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2711         if (ok && dgs) {
2712                 dgb = mdev->int_dig_out;
2713                 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2714                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2715         }
2716         if (ok) {
2717                 /* For protocol A, we have to memcpy the payload into
2718                  * socket buffers, as we may complete right away
2719                  * as soon as we handed it over to tcp, at which point the data
2720                  * pages may become invalid.
2721                  *
2722                  * For data-integrity enabled, we copy it as well, so we can be
2723                  * sure that even if the bio pages may still be modified, it
2724                  * won't change the data on the wire, thus if the digest checks
2725                  * out ok after sending on this side, but does not fit on the
2726                  * receiving side, we sure have detected corruption elsewhere.
2727                  */
2728                 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2729                         ok = _drbd_send_bio(mdev, req->master_bio);
2730                 else
2731                         ok = _drbd_send_zc_bio(mdev, req->master_bio);
2732
2733                 /* double check digest, sometimes buffers have been modified in flight. */
2734                 if (dgs > 0 && dgs <= 64) {
2735                         /* 64 byte, 512 bit, is the largest digest size
2736                          * currently supported in kernel crypto. */
2737                         unsigned char digest[64];
2738                         drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2739                         if (memcmp(mdev->int_dig_out, digest, dgs)) {
2740                                 dev_warn(DEV,
2741                                         "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2742                                         (unsigned long long)req->sector, req->size);
2743                         }
2744                 } /* else if (dgs > 64) {
2745                      ... Be noisy about digest too large ...
2746                 } */
2747         }
2748
2749         drbd_put_data_sock(mdev);
2750
2751         return ok;
2752 }
2753
2754 /* answer packet, used to send data back for read requests:
2755  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2756  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2757  */
2758 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2759                     struct drbd_epoch_entry *e)
2760 {
2761         int ok;
2762         struct p_data p;
2763         void *dgb;
2764         int dgs;
2765
2766         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2767                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2768
2769         if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2770                 p.head.h80.magic   = BE_DRBD_MAGIC;
2771                 p.head.h80.command = cpu_to_be16(cmd);
2772                 p.head.h80.length  =
2773                         cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2774         } else {
2775                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2776                 p.head.h95.command = cpu_to_be16(cmd);
2777                 p.head.h95.length  =
2778                         cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2779         }
2780
2781         p.sector   = cpu_to_be64(e->sector);
2782         p.block_id = e->block_id;
2783         /* p.seq_num  = 0;    No sequence numbers here.. */
2784
2785         /* Only called by our kernel thread.
2786          * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2787          * in response to admin command or module unload.
2788          */
2789         if (!drbd_get_data_sock(mdev))
2790                 return 0;
2791
2792         ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2793         if (ok && dgs) {
2794                 dgb = mdev->int_dig_out;
2795                 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2796                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2797         }
2798         if (ok)
2799                 ok = _drbd_send_zc_ee(mdev, e);
2800
2801         drbd_put_data_sock(mdev);
2802
2803         return ok;
2804 }
2805
2806 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2807 {
2808         struct p_block_desc p;
2809
2810         p.sector  = cpu_to_be64(req->sector);
2811         p.blksize = cpu_to_be32(req->size);
2812
2813         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2814 }
2815
2816 /*
2817   drbd_send distinguishes two cases:
2818
2819   Packets sent via the data socket "sock"
2820   and packets sent via the meta data socket "msock"
2821
2822                     sock                      msock
2823   -----------------+-------------------------+------------------------------
2824   timeout           conf.timeout / 2          conf.timeout / 2
2825   timeout action    send a ping via msock     Abort communication
2826                                               and close all sockets
2827 */
2828
2829 /*
2830  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2831  */
2832 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2833               void *buf, size_t size, unsigned msg_flags)
2834 {
2835         struct kvec iov;
2836         struct msghdr msg;
2837         int rv, sent = 0;
2838
2839         if (!sock)
2840                 return -1000;
2841
2842         /* THINK  if (signal_pending) return ... ? */
2843
2844         iov.iov_base = buf;
2845         iov.iov_len  = size;
2846
2847         msg.msg_name       = NULL;
2848         msg.msg_namelen    = 0;
2849         msg.msg_control    = NULL;
2850         msg.msg_controllen = 0;
2851         msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2852
2853         if (sock == mdev->data.socket) {
2854                 mdev->ko_count = mdev->net_conf->ko_count;
2855                 drbd_update_congested(mdev);
2856         }
2857         do {
2858                 /* STRANGE
2859                  * tcp_sendmsg does _not_ use its size parameter at all ?
2860                  *
2861                  * -EAGAIN on timeout, -EINTR on signal.
2862                  */
2863 /* THINK
2864  * do we need to block DRBD_SIG if sock == &meta.socket ??
2865  * otherwise wake_asender() might interrupt some send_*Ack !
2866  */
2867                 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2868                 if (rv == -EAGAIN) {
2869                         if (we_should_drop_the_connection(mdev, sock))
2870                                 break;
2871                         else
2872                                 continue;
2873                 }
2874                 D_ASSERT(rv != 0);
2875                 if (rv == -EINTR) {
2876                         flush_signals(current);
2877                         rv = 0;
2878                 }
2879                 if (rv < 0)
2880                         break;
2881                 sent += rv;
2882                 iov.iov_base += rv;
2883                 iov.iov_len  -= rv;
2884         } while (sent < size);
2885
2886         if (sock == mdev->data.socket)
2887                 clear_bit(NET_CONGESTED, &mdev->flags);
2888
2889         if (rv <= 0) {
2890                 if (rv != -EAGAIN) {
2891                         dev_err(DEV, "%s_sendmsg returned %d\n",
2892                             sock == mdev->meta.socket ? "msock" : "sock",
2893                             rv);
2894                         drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2895                 } else
2896                         drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2897         }
2898
2899         return sent;
2900 }
2901
2902 static int drbd_open(struct block_device *bdev, fmode_t mode)
2903 {
2904         struct drbd_conf *mdev = bdev->bd_disk->private_data;
2905         unsigned long flags;
2906         int rv = 0;
2907
2908         mutex_lock(&drbd_main_mutex);
2909         spin_lock_irqsave(&mdev->req_lock, flags);
2910         /* to have a stable mdev->state.role
2911          * and no race with updating open_cnt */
2912
2913         if (mdev->state.role != R_PRIMARY) {
2914                 if (mode & FMODE_WRITE)
2915                         rv = -EROFS;
2916                 else if (!allow_oos)
2917                         rv = -EMEDIUMTYPE;
2918         }
2919
2920         if (!rv)
2921                 mdev->open_cnt++;
2922         spin_unlock_irqrestore(&mdev->req_lock, flags);
2923         mutex_unlock(&drbd_main_mutex);
2924
2925         return rv;
2926 }
2927
2928 static int drbd_release(struct gendisk *gd, fmode_t mode)
2929 {
2930         struct drbd_conf *mdev = gd->private_data;
2931         mutex_lock(&drbd_main_mutex);
2932         mdev->open_cnt--;
2933         mutex_unlock(&drbd_main_mutex);
2934         return 0;
2935 }
2936
2937 static void drbd_set_defaults(struct drbd_conf *mdev)
2938 {
2939         /* This way we get a compile error when sync_conf grows,
2940            and we forgot to initialize it here */
2941         mdev->sync_conf = (struct syncer_conf) {
2942                 /* .rate = */           DRBD_RATE_DEF,
2943                 /* .after = */          DRBD_AFTER_DEF,
2944                 /* .al_extents = */     DRBD_AL_EXTENTS_DEF,
2945                 /* .verify_alg = */     {}, 0,
2946                 /* .cpu_mask = */       {}, 0,
2947                 /* .csums_alg = */      {}, 0,
2948                 /* .use_rle = */        0,
2949                 /* .on_no_data = */     DRBD_ON_NO_DATA_DEF,
2950                 /* .c_plan_ahead = */   DRBD_C_PLAN_AHEAD_DEF,
2951                 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2952                 /* .c_fill_target = */  DRBD_C_FILL_TARGET_DEF,
2953                 /* .c_max_rate = */     DRBD_C_MAX_RATE_DEF,
2954                 /* .c_min_rate = */     DRBD_C_MIN_RATE_DEF
2955         };
2956
2957         /* Have to use that way, because the layout differs between
2958            big endian and little endian */
2959         mdev->state = (union drbd_state) {
2960                 { .role = R_SECONDARY,
2961                   .peer = R_UNKNOWN,
2962                   .conn = C_STANDALONE,
2963                   .disk = D_DISKLESS,
2964                   .pdsk = D_UNKNOWN,
2965                   .susp = 0,
2966                   .susp_nod = 0,
2967                   .susp_fen = 0
2968                 } };
2969 }
2970
2971 void drbd_init_set_defaults(struct drbd_conf *mdev)
2972 {
2973         /* the memset(,0,) did most of this.
2974          * note: only assignments, no allocation in here */
2975
2976         drbd_set_defaults(mdev);
2977
2978         atomic_set(&mdev->ap_bio_cnt, 0);
2979         atomic_set(&mdev->ap_pending_cnt, 0);
2980         atomic_set(&mdev->rs_pending_cnt, 0);
2981         atomic_set(&mdev->unacked_cnt, 0);
2982         atomic_set(&mdev->local_cnt, 0);
2983         atomic_set(&mdev->net_cnt, 0);
2984         atomic_set(&mdev->packet_seq, 0);
2985         atomic_set(&mdev->pp_in_use, 0);
2986         atomic_set(&mdev->pp_in_use_by_net, 0);
2987         atomic_set(&mdev->rs_sect_in, 0);
2988         atomic_set(&mdev->rs_sect_ev, 0);
2989         atomic_set(&mdev->ap_in_flight, 0);
2990
2991         mutex_init(&mdev->md_io_mutex);
2992         mutex_init(&mdev->data.mutex);
2993         mutex_init(&mdev->meta.mutex);
2994         sema_init(&mdev->data.work.s, 0);
2995         sema_init(&mdev->meta.work.s, 0);
2996         mutex_init(&mdev->state_mutex);
2997
2998         spin_lock_init(&mdev->data.work.q_lock);
2999         spin_lock_init(&mdev->meta.work.q_lock);
3000
3001         spin_lock_init(&mdev->al_lock);
3002         spin_lock_init(&mdev->req_lock);
3003         spin_lock_init(&mdev->peer_seq_lock);
3004         spin_lock_init(&mdev->epoch_lock);
3005
3006         INIT_LIST_HEAD(&mdev->active_ee);
3007         INIT_LIST_HEAD(&mdev->sync_ee);
3008         INIT_LIST_HEAD(&mdev->done_ee);
3009         INIT_LIST_HEAD(&mdev->read_ee);
3010         INIT_LIST_HEAD(&mdev->net_ee);
3011         INIT_LIST_HEAD(&mdev->resync_reads);
3012         INIT_LIST_HEAD(&mdev->data.work.q);
3013         INIT_LIST_HEAD(&mdev->meta.work.q);
3014         INIT_LIST_HEAD(&mdev->resync_work.list);
3015         INIT_LIST_HEAD(&mdev->unplug_work.list);
3016         INIT_LIST_HEAD(&mdev->go_diskless.list);
3017         INIT_LIST_HEAD(&mdev->md_sync_work.list);
3018         INIT_LIST_HEAD(&mdev->start_resync_work.list);
3019         INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
3020
3021         mdev->resync_work.cb  = w_resync_timer;
3022         mdev->unplug_work.cb  = w_send_write_hint;
3023         mdev->go_diskless.cb  = w_go_diskless;
3024         mdev->md_sync_work.cb = w_md_sync;
3025         mdev->bm_io_work.w.cb = w_bitmap_io;
3026         mdev->start_resync_work.cb = w_start_resync;
3027         init_timer(&mdev->resync_timer);
3028         init_timer(&mdev->md_sync_timer);
3029         init_timer(&mdev->start_resync_timer);
3030         init_timer(&mdev->request_timer);
3031         mdev->resync_timer.function = resync_timer_fn;
3032         mdev->resync_timer.data = (unsigned long) mdev;
3033         mdev->md_sync_timer.function = md_sync_timer_fn;
3034         mdev->md_sync_timer.data = (unsigned long) mdev;
3035         mdev->start_resync_timer.function = start_resync_timer_fn;
3036         mdev->start_resync_timer.data = (unsigned long) mdev;
3037         mdev->request_timer.function = request_timer_fn;
3038         mdev->request_timer.data = (unsigned long) mdev;
3039
3040         init_waitqueue_head(&mdev->misc_wait);
3041         init_waitqueue_head(&mdev->state_wait);
3042         init_waitqueue_head(&mdev->net_cnt_wait);
3043         init_waitqueue_head(&mdev->ee_wait);
3044         init_waitqueue_head(&mdev->al_wait);
3045         init_waitqueue_head(&mdev->seq_wait);
3046
3047         drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3048         drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3049         drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3050
3051         mdev->agreed_pro_version = PRO_VERSION_MAX;
3052         mdev->write_ordering = WO_bdev_flush;
3053         mdev->resync_wenr = LC_FREE;
3054         mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3055         mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3056 }
3057
3058 void drbd_mdev_cleanup(struct drbd_conf *mdev)
3059 {
3060         int i;