drbd: send_bitmap_rle_or_plain: Get rid of ugly and useless enum
[pandora-kernel.git] / drivers / block / drbd / drbd_main.c
1 /*
2    drbd.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27  */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
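/* Deferred "after state change" work: __drbd_set_state() queues one of these
 * so that after_state_ch() runs later from the worker, outside the req_lock;
 * "done", if set, gets completed once that after-state work has finished. */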
59 struct after_state_chg_work {
60         struct drbd_work w;
61         union drbd_state os;
62         union drbd_state ns;
63         enum chg_state_flags flags;
64         struct completion *done;
65 };
66
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
71
72 int drbd_init(void);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77                            union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82
83 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84               "Lars Ellenberg <lars@linbit.com>");
85 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86 MODULE_VERSION(REL_VERSION);
87 MODULE_LICENSE("GPL");
88 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
89 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
90
91 #include <linux/moduleparam.h>
92 /* allow_open_on_secondary */
93 MODULE_PARM_DESC(allow_oos, "DONT USE!");
94 /* thanks to these macros, if compiled into the kernel (not as a module),
95  * this becomes the boot parameter drbd.minor_count */
96 module_param(minor_count, uint, 0444);
97 module_param(disable_sendpage, bool, 0644);
98 module_param(allow_oos, bool, 0);
99 module_param(cn_idx, uint, 0444);
100 module_param(proc_details, int, 0644);
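/* Example (illustrative): "modprobe drbd minor_count=64" at load time, or the
 * boot parameter "drbd.minor_count=64" when the driver is built in. */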
101
102 #ifdef CONFIG_DRBD_FAULT_INJECTION
103 int enable_faults;
104 int fault_rate;
105 static int fault_count;
106 int fault_devs;
107 /* bitmap of enabled faults */
108 module_param(enable_faults, int, 0664);
109 /* fault rate % value - applies to all enabled faults */
110 module_param(fault_rate, int, 0664);
111 /* count of faults inserted */
112 module_param(fault_count, int, 0664);
113 /* bitmap of devices to insert faults on */
114 module_param(fault_devs, int, 0644);
115 #endif
116
117 /* module parameters, defined here */
118 unsigned int minor_count = 32;
119 int disable_sendpage;
120 int allow_oos;
121 unsigned int cn_idx = CN_IDX_DRBD;
122 int proc_details;       /* Detail level in /proc/drbd */
123
124 /* Module parameter for setting the user mode helper program
125  * to run. Default is /sbin/drbdadm */
126 char usermode_helper[80] = "/sbin/drbdadm";
127
128 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
129
130 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
131  * as member "struct gendisk *vdisk;"
132  */
133 struct drbd_conf **minor_table;
134
135 struct kmem_cache *drbd_request_cache;
136 struct kmem_cache *drbd_ee_cache;       /* epoch entries */
137 struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
138 struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
139 mempool_t *drbd_request_mempool;
140 mempool_t *drbd_ee_mempool;
141
142 /* I do not use a standard mempool, because:
143    1) I want to hand out the pre-allocated objects first.
144    2) I want to be able to interrupt sleeping allocation with a signal.
145    Note: This is a singly linked list; the next pointer is the private
146          member of struct page.
147  */
148 struct page *drbd_pp_pool;
149 spinlock_t   drbd_pp_lock;
150 int          drbd_pp_vacant;
151 wait_queue_head_t drbd_pp_wait;
152
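/* Rate-limit state for noisy log paths: bursts of at most 5 messages per
 * 5*HZ interval (DEFINE_RATELIMIT_STATE(name, interval, burst)). */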
153 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
154
155 static const struct block_device_operations drbd_ops = {
156         .owner =   THIS_MODULE,
157         .open =    drbd_open,
158         .release = drbd_release,
159 };
160
161 #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
162
163 #ifdef __CHECKER__
164 /* When checking with sparse, and this is an inline function, sparse will
165    give tons of false positives. When this is a real function, sparse works.
166  */
167 int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
168 {
169         int io_allowed;
170
171         atomic_inc(&mdev->local_cnt);
172         io_allowed = (mdev->state.disk >= mins);
173         if (!io_allowed) {
174                 if (atomic_dec_and_test(&mdev->local_cnt))
175                         wake_up(&mdev->misc_wait);
176         }
177         return io_allowed;
178 }
179
180 #endif
181
182 /**
183  * DOC: The transfer log
184  *
185  * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
186  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
187  * of the list. There is always at least one &struct drbd_tl_epoch object.
188  *
189  * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
190  * attached.
191  */
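/*
 * Illustrative sketch only: with two epochs in flight the log looks like
 *
 *   mdev->oldest_tle -> epoch A --next--> epoch B   (== mdev->newest_tle)
 *
 * New write requests are attached to the newest epoch; a barrier ack from
 * the peer retires the oldest one in tl_release() below.
 */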
192 static int tl_init(struct drbd_conf *mdev)
193 {
194         struct drbd_tl_epoch *b;
195
196         /* during device minor initialization, we may well use GFP_KERNEL */
197         b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
198         if (!b)
199                 return 0;
200         INIT_LIST_HEAD(&b->requests);
201         INIT_LIST_HEAD(&b->w.list);
202         b->next = NULL;
203         b->br_number = 4711;
204         b->n_writes = 0;
205         b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
206
207         mdev->oldest_tle = b;
208         mdev->newest_tle = b;
209         INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
210
211         mdev->tl_hash = NULL;
212         mdev->tl_hash_s = 0;
213
214         return 1;
215 }
216
217 static void tl_cleanup(struct drbd_conf *mdev)
218 {
219         D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
220         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
221         kfree(mdev->oldest_tle);
222         mdev->oldest_tle = NULL;
223         kfree(mdev->unused_spare_tle);
224         mdev->unused_spare_tle = NULL;
225         kfree(mdev->tl_hash);
226         mdev->tl_hash = NULL;
227         mdev->tl_hash_s = 0;
228 }
229
230 /**
231  * _tl_add_barrier() - Adds a barrier to the transfer log
232  * @mdev:       DRBD device.
233  * @new:        Barrier to be added before the current head of the TL.
234  *
235  * The caller must hold the req_lock.
236  */
237 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
238 {
239         struct drbd_tl_epoch *newest_before;
240
241         INIT_LIST_HEAD(&new->requests);
242         INIT_LIST_HEAD(&new->w.list);
243         new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
244         new->next = NULL;
245         new->n_writes = 0;
246
247         newest_before = mdev->newest_tle;
248         /* never send a barrier number == 0, because that is special-cased
249          * when using TCQ for our write ordering code */
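        /* GCC's "?:" with an omitted middle operand yields the left-hand
         * value unless it is 0, so a counter that wraps to 0 becomes 1. */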
250         new->br_number = (newest_before->br_number+1) ?: 1;
251         if (mdev->newest_tle != new) {
252                 mdev->newest_tle->next = new;
253                 mdev->newest_tle = new;
254         }
255 }
256
257 /**
258  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
259  * @mdev:       DRBD device.
260  * @barrier_nr: Expected identifier of the DRBD write barrier packet.
261  * @set_size:   Expected number of requests before that barrier.
262  *
263  * In case the passed barrier_nr or set_size does not match the oldest
264  * &struct drbd_tl_epoch objects this function will cause a termination
265  * of the connection.
266  */
267 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
268                        unsigned int set_size)
269 {
270         struct drbd_tl_epoch *b, *nob; /* next old barrier */
271         struct list_head *le, *tle;
272         struct drbd_request *r;
273
274         spin_lock_irq(&mdev->req_lock);
275
276         b = mdev->oldest_tle;
277
278         /* first some paranoia code */
279         if (b == NULL) {
280                 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
281                         barrier_nr);
282                 goto bail;
283         }
284         if (b->br_number != barrier_nr) {
285                 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
286                         barrier_nr, b->br_number);
287                 goto bail;
288         }
289         if (b->n_writes != set_size) {
290                 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
291                         barrier_nr, set_size, b->n_writes);
292                 goto bail;
293         }
294
295         /* Clean up list of requests processed during current epoch */
296         list_for_each_safe(le, tle, &b->requests) {
297                 r = list_entry(le, struct drbd_request, tl_requests);
298                 _req_mod(r, barrier_acked);
299         }
300         /* There could be requests on the list waiting for completion
301    of the write to the local disk. To avoid corruption of the
302    slab's data structures we have to remove the list's head.
303
304            Also there could have been a barrier ack out of sequence, overtaking
305            the write acks - which would be a bug and violating write ordering.
306            To not deadlock in case we lose connection while such requests are
307            still pending, we need some way to find them for the
308    _req_mod(connection_lost_while_pending).
309
310            These have been list_move'd to the out_of_sequence_requests list in
311            _req_mod(, barrier_acked) above.
312            */
313         list_del_init(&b->requests);
314
315         nob = b->next;
316         if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
317                 _tl_add_barrier(mdev, b);
318                 if (nob)
319                         mdev->oldest_tle = nob;
320                 /* if nob == NULL, b was the only barrier and becomes the new
321                    barrier. Therefore mdev->oldest_tle already points to b */
322         } else {
323                 D_ASSERT(nob != NULL);
324                 mdev->oldest_tle = nob;
325                 kfree(b);
326         }
327
328         spin_unlock_irq(&mdev->req_lock);
329         dec_ap_pending(mdev);
330
331         return;
332
333 bail:
334         spin_unlock_irq(&mdev->req_lock);
335         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
336 }
337
338 /**
339  * _tl_restart() - Walks the transfer log, and applies an action to all requests
340  * @mdev:       DRBD device.
341  * @what:       The action/event to perform with all request objects
342  *
343  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
344  * restart_frozen_disk_io.
345  */
346 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
347 {
348         struct drbd_tl_epoch *b, *tmp, **pn;
349         struct list_head *le, *tle, carry_reads;
350         struct drbd_request *req;
351         int rv, n_writes, n_reads;
352
353         b = mdev->oldest_tle;
354         pn = &mdev->oldest_tle;
355         while (b) {
356                 n_writes = 0;
357                 n_reads = 0;
358                 INIT_LIST_HEAD(&carry_reads);
359                 list_for_each_safe(le, tle, &b->requests) {
360                         req = list_entry(le, struct drbd_request, tl_requests);
361                         rv = _req_mod(req, what);
362
363                         n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
364                         n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
365                 }
366                 tmp = b->next;
367
368                 if (n_writes) {
369                         if (what == resend) {
370                                 b->n_writes = n_writes;
371                                 if (b->w.cb == NULL) {
372                                         b->w.cb = w_send_barrier;
373                                         inc_ap_pending(mdev);
374                                         set_bit(CREATE_BARRIER, &mdev->flags);
375                                 }
376
377                                 drbd_queue_work(&mdev->data.work, &b->w);
378                         }
379                         pn = &b->next;
380                 } else {
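                        /* This epoch carries no writes, so it can go away; any
                         * reads still on it are collected via carry_reads and
                         * spliced onto the following epoch further down, so
                         * pending local IO can still find and complete them. */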
381                         if (n_reads)
382                                 list_add(&carry_reads, &b->requests);
383                         /* there could still be requests on that ring list,
384                          * in case local io is still pending */
385                         list_del(&b->requests);
386
387                         /* dec_ap_pending corresponding to queue_barrier.
388                          * the newest barrier may not have been queued yet,
389                          * in which case w.cb is still NULL. */
390                         if (b->w.cb != NULL)
391                                 dec_ap_pending(mdev);
392
393                         if (b == mdev->newest_tle) {
394                                 /* recycle, but reinit! */
395                                 D_ASSERT(tmp == NULL);
396                                 INIT_LIST_HEAD(&b->requests);
397                                 list_splice(&carry_reads, &b->requests);
398                                 INIT_LIST_HEAD(&b->w.list);
399                                 b->w.cb = NULL;
400                                 b->br_number = net_random();
401                                 b->n_writes = 0;
402
403                                 *pn = b;
404                                 break;
405                         }
406                         *pn = tmp;
407                         kfree(b);
408                 }
409                 b = tmp;
410                 list_splice(&carry_reads, &b->requests);
411         }
412 }
413
414
415 /**
416  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
417  * @mdev:       DRBD device.
418  *
419  * This is called after the connection to the peer was lost. The storage covered
420  * by the requests on the transfer log gets marked as out of sync. Called from the
421  * receiver thread and the worker thread.
422  */
423 void tl_clear(struct drbd_conf *mdev)
424 {
425         struct list_head *le, *tle;
426         struct drbd_request *r;
427
428         spin_lock_irq(&mdev->req_lock);
429
430         _tl_restart(mdev, connection_lost_while_pending);
431
432         /* we expect this list to be empty. */
433         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
434
435         /* but just in case, clean it up anyway! */
436         list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
437                 r = list_entry(le, struct drbd_request, tl_requests);
438                 /* It would be nice to complete outside of spinlock.
439                  * But this is easier for now. */
440                 _req_mod(r, connection_lost_while_pending);
441         }
442
443         /* ensure bit indicating barrier is required is clear */
444         clear_bit(CREATE_BARRIER, &mdev->flags);
445
446         memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
447
448         spin_unlock_irq(&mdev->req_lock);
449 }
450
451 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
452 {
453         spin_lock_irq(&mdev->req_lock);
454         _tl_restart(mdev, what);
455         spin_unlock_irq(&mdev->req_lock);
456 }
457
458 /**
459  * cl_wide_st_chg() - true if the state change is a cluster wide one
460  * @mdev:       DRBD device.
461  * @os:         old (current) state.
462  * @ns:         new (wanted) state.
463  */
464 static int cl_wide_st_chg(struct drbd_conf *mdev,
465                           union drbd_state os, union drbd_state ns)
466 {
467         return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
468                  ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
469                   (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
470                   (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
471                   (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
472                 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
473                 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
474 }
475
476 enum drbd_state_rv
477 drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
478                   union drbd_state mask, union drbd_state val)
479 {
480         unsigned long flags;
481         union drbd_state os, ns;
482         enum drbd_state_rv rv;
483
484         spin_lock_irqsave(&mdev->req_lock, flags);
485         os = mdev->state;
486         ns.i = (os.i & ~mask.i) | val.i;
487         rv = _drbd_set_state(mdev, ns, f, NULL);
488         ns = mdev->state;
489         spin_unlock_irqrestore(&mdev->req_lock, flags);
490
491         return rv;
492 }
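/*
 * Callers normally build the mask/val pair with the NS()/_NS() convenience
 * macros from drbd_int.h, which select a single field of union drbd_state;
 * ns.i = (os.i & ~mask.i) | val.i above then folds the new value in while
 * leaving every other state field untouched.
 */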
493
494 /**
495  * drbd_force_state() - Impose a change which happens outside our control on our state
496  * @mdev:       DRBD device.
497  * @mask:       mask of state bits to change.
498  * @val:        value of new state bits.
499  */
500 void drbd_force_state(struct drbd_conf *mdev,
501         union drbd_state mask, union drbd_state val)
502 {
503         drbd_change_state(mdev, CS_HARD, mask, val);
504 }
505
506 static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
507 static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
508                                                     union drbd_state,
509                                                     union drbd_state);
510 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
511                                        union drbd_state ns, const char **warn_sync_abort);
512 int drbd_send_state_req(struct drbd_conf *,
513                         union drbd_state, union drbd_state);
514
515 static enum drbd_state_rv
516 _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
517              union drbd_state val)
518 {
519         union drbd_state os, ns;
520         unsigned long flags;
521         enum drbd_state_rv rv;
522
523         if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
524                 return SS_CW_SUCCESS;
525
526         if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
527                 return SS_CW_FAILED_BY_PEER;
528
529         rv = 0;
530         spin_lock_irqsave(&mdev->req_lock, flags);
531         os = mdev->state;
532         ns.i = (os.i & ~mask.i) | val.i;
533         ns = sanitize_state(mdev, os, ns, NULL);
534
535         if (!cl_wide_st_chg(mdev, os, ns))
536                 rv = SS_CW_NO_NEED;
537         if (!rv) {
538                 rv = is_valid_state(mdev, ns);
539                 if (rv == SS_SUCCESS) {
540                         rv = is_valid_state_transition(mdev, ns, os);
541                         if (rv == SS_SUCCESS)
542                                 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
543                 }
544         }
545         spin_unlock_irqrestore(&mdev->req_lock, flags);
546
547         return rv;
548 }
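/*
 * How the cluster-wide path works: the change is first proposed to the peer
 * via drbd_send_state_req(); drbd_req_state() below then waits in
 * _req_st_cond() for the CL_ST_CHG_SUCCESS/CL_ST_CHG_FAIL bits, which get set
 * once the peer's reply arrives, before committing the change locally.
 */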
549
550 /**
551  * drbd_req_state() - Perform a possibly cluster-wide state change
552  * @mdev:       DRBD device.
553  * @mask:       mask of state bits to change.
554  * @val:        value of new state bits.
555  * @f:          flags
556  *
557  * Should not be called directly, use drbd_request_state() or
558  * _drbd_request_state().
559  */
560 static enum drbd_state_rv
561 drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
562                union drbd_state val, enum chg_state_flags f)
563 {
564         struct completion done;
565         unsigned long flags;
566         union drbd_state os, ns;
567         enum drbd_state_rv rv;
568
569         init_completion(&done);
570
571         if (f & CS_SERIALIZE)
572                 mutex_lock(&mdev->state_mutex);
573
574         spin_lock_irqsave(&mdev->req_lock, flags);
575         os = mdev->state;
576         ns.i = (os.i & ~mask.i) | val.i;
577         ns = sanitize_state(mdev, os, ns, NULL);
578
579         if (cl_wide_st_chg(mdev, os, ns)) {
580                 rv = is_valid_state(mdev, ns);
581                 if (rv == SS_SUCCESS)
582                         rv = is_valid_state_transition(mdev, ns, os);
583                 spin_unlock_irqrestore(&mdev->req_lock, flags);
584
585                 if (rv < SS_SUCCESS) {
586                         if (f & CS_VERBOSE)
587                                 print_st_err(mdev, os, ns, rv);
588                         goto abort;
589                 }
590
591                 drbd_state_lock(mdev);
592                 if (!drbd_send_state_req(mdev, mask, val)) {
593                         drbd_state_unlock(mdev);
594                         rv = SS_CW_FAILED_BY_PEER;
595                         if (f & CS_VERBOSE)
596                                 print_st_err(mdev, os, ns, rv);
597                         goto abort;
598                 }
599
600                 wait_event(mdev->state_wait,
601                         (rv = _req_st_cond(mdev, mask, val)));
602
603                 if (rv < SS_SUCCESS) {
604                         drbd_state_unlock(mdev);
605                         if (f & CS_VERBOSE)
606                                 print_st_err(mdev, os, ns, rv);
607                         goto abort;
608                 }
609                 spin_lock_irqsave(&mdev->req_lock, flags);
610                 os = mdev->state;
611                 ns.i = (os.i & ~mask.i) | val.i;
612                 rv = _drbd_set_state(mdev, ns, f, &done);
613                 drbd_state_unlock(mdev);
614         } else {
615                 rv = _drbd_set_state(mdev, ns, f, &done);
616         }
617
618         spin_unlock_irqrestore(&mdev->req_lock, flags);
619
620         if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
621                 D_ASSERT(current != mdev->worker.task);
622                 wait_for_completion(&done);
623         }
624
625 abort:
626         if (f & CS_SERIALIZE)
627                 mutex_unlock(&mdev->state_mutex);
628
629         return rv;
630 }
631
632 /**
633  * _drbd_request_state() - Request a state change (with flags)
634  * @mdev:       DRBD device.
635  * @mask:       mask of state bits to change.
636  * @val:        value of new state bits.
637  * @f:          flags
638  *
639  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
640  * flag, or when logging of failed state change requests is not desired.
641  */
642 enum drbd_state_rv
643 _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
644                     union drbd_state val, enum chg_state_flags f)
645 {
646         enum drbd_state_rv rv;
647
648         wait_event(mdev->state_wait,
649                    (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
650
651         return rv;
652 }
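/*
 * Typical usage elsewhere in the driver (illustrative only):
 *
 *     drbd_request_state(mdev, NS(disk, D_OUTDATED));
 *
 * drbd_request_state() is a thin wrapper in drbd_int.h around
 * _drbd_request_state() that fills in the usual flags (verbose error
 * logging among them).
 */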
653
654 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
655 {
656         dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
657             name,
658             drbd_conn_str(ns.conn),
659             drbd_role_str(ns.role),
660             drbd_role_str(ns.peer),
661             drbd_disk_str(ns.disk),
662             drbd_disk_str(ns.pdsk),
663             is_susp(ns) ? 's' : 'r',
664             ns.aftr_isp ? 'a' : '-',
665             ns.peer_isp ? 'p' : '-',
666             ns.user_isp ? 'u' : '-'
667             );
668 }
669
670 void print_st_err(struct drbd_conf *mdev, union drbd_state os,
671                   union drbd_state ns, enum drbd_state_rv err)
672 {
673         if (err == SS_IN_TRANSIENT_STATE)
674                 return;
675         dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
676         print_st(mdev, " state", os);
677         print_st(mdev, "wanted", ns);
678 }
679
680
681 /**
682  * is_valid_state() - Returns an SS_ error code if ns is not valid
683  * @mdev:       DRBD device.
684  * @ns:         State to consider.
685  */
686 static enum drbd_state_rv
687 is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
688 {
689         /* See drbd_state_sw_errors in drbd_strings.c */
690
691         enum drbd_fencing_p fp;
692         enum drbd_state_rv rv = SS_SUCCESS;
693
694         fp = FP_DONT_CARE;
695         if (get_ldev(mdev)) {
696                 fp = mdev->ldev->dc.fencing;
697                 put_ldev(mdev);
698         }
699
700         if (get_net_conf(mdev)) {
701                 if (!mdev->net_conf->two_primaries &&
702                     ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
703                         rv = SS_TWO_PRIMARIES;
704                 put_net_conf(mdev);
705         }
706
707         if (rv <= 0)
708                 /* already found a reason to abort */;
709         else if (ns.role == R_SECONDARY && mdev->open_cnt)
710                 rv = SS_DEVICE_IN_USE;
711
712         else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
713                 rv = SS_NO_UP_TO_DATE_DISK;
714
715         else if (fp >= FP_RESOURCE &&
716                  ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
717                 rv = SS_PRIMARY_NOP;
718
719         else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
720                 rv = SS_NO_UP_TO_DATE_DISK;
721
722         else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
723                 rv = SS_NO_LOCAL_DISK;
724
725         else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
726                 rv = SS_NO_REMOTE_DISK;
727
728         else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
729                 rv = SS_NO_UP_TO_DATE_DISK;
730
731         else if ((ns.conn == C_CONNECTED ||
732                   ns.conn == C_WF_BITMAP_S ||
733                   ns.conn == C_SYNC_SOURCE ||
734                   ns.conn == C_PAUSED_SYNC_S) &&
735                   ns.disk == D_OUTDATED)
736                 rv = SS_CONNECTED_OUTDATES;
737
738         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
739                  (mdev->sync_conf.verify_alg[0] == 0))
740                 rv = SS_NO_VERIFY_ALG;
741
742         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
743                   mdev->agreed_pro_version < 88)
744                 rv = SS_NOT_SUPPORTED;
745
746         return rv;
747 }
748
749 /**
750  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
751  * @mdev:       DRBD device.
752  * @ns:         new state.
753  * @os:         old state.
754  */
755 static enum drbd_state_rv
756 is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
757                           union drbd_state os)
758 {
759         enum drbd_state_rv rv = SS_SUCCESS;
760
761         if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
762             os.conn > C_CONNECTED)
763                 rv = SS_RESYNC_RUNNING;
764
765         if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
766                 rv = SS_ALREADY_STANDALONE;
767
768         if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
769                 rv = SS_IS_DISKLESS;
770
771         if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
772                 rv = SS_NO_NET_CONFIG;
773
774         if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
775                 rv = SS_LOWER_THAN_OUTDATED;
776
777         if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
778                 rv = SS_IN_TRANSIENT_STATE;
779
780         if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
781                 rv = SS_IN_TRANSIENT_STATE;
782
783         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
784                 rv = SS_NEED_CONNECTION;
785
786         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
787             ns.conn != os.conn && os.conn > C_CONNECTED)
788                 rv = SS_RESYNC_RUNNING;
789
790         if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
791             os.conn < C_CONNECTED)
792                 rv = SS_NEED_CONNECTION;
793
794         if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
795             && os.conn < C_WF_REPORT_PARAMS)
796                 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
797
798         return rv;
799 }
800
801 /**
802  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
803  * @mdev:       DRBD device.
804  * @os:         old state.
805  * @ns:         new state.
806  * @warn_sync_abort:
807  *
808  * When we lose the connection, we have to set the state of the peer's disk (pdsk)
809  * to D_UNKNOWN. This rule and many more along those lines are in this function.
810  */
811 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
812                                        union drbd_state ns, const char **warn_sync_abort)
813 {
814         enum drbd_fencing_p fp;
815         enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
816
817         fp = FP_DONT_CARE;
818         if (get_ldev(mdev)) {
819                 fp = mdev->ldev->dc.fencing;
820                 put_ldev(mdev);
821         }
822
823         /* Do not let network errors reach a device whose network part is not configured */
824         if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
825             os.conn <= C_DISCONNECTING)
826                 ns.conn = os.conn;
827
828         /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
829          * If you try to go into some Sync* state, that shall fail (elsewhere). */
830         if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
831             ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
832                 ns.conn = os.conn;
833
834         /* we cannot fail (again) if we already detached */
835         if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
836                 ns.disk = D_DISKLESS;
837
838         /* if we are only D_ATTACHING yet,
839          * we can (and should) go directly to D_DISKLESS. */
840         if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
841                 ns.disk = D_DISKLESS;
842
843         /* After C_DISCONNECTING only C_STANDALONE may follow */
844         if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
845                 ns.conn = os.conn;
846
847         if (ns.conn < C_CONNECTED) {
848                 ns.peer_isp = 0;
849                 ns.peer = R_UNKNOWN;
850                 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
851                         ns.pdsk = D_UNKNOWN;
852         }
853
854         /* Clear the aftr_isp when becoming unconfigured */
855         if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
856                 ns.aftr_isp = 0;
857
858         /* Abort resync if a disk fails/detaches */
859         if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
860             (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
861                 if (warn_sync_abort)
862                         *warn_sync_abort =
863                                 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
864                                 "Online-verify" : "Resync";
865                 ns.conn = C_CONNECTED;
866         }
867
868         /* Connection breaks down before we finished "Negotiating" */
869         if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
870             get_ldev_if_state(mdev, D_NEGOTIATING)) {
871                 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
872                         ns.disk = mdev->new_state_tmp.disk;
873                         ns.pdsk = mdev->new_state_tmp.pdsk;
874                 } else {
875                         dev_alert(DEV, "Connection lost while negotiating, no data!\n");
876                         ns.disk = D_DISKLESS;
877                         ns.pdsk = D_UNKNOWN;
878                 }
879                 put_ldev(mdev);
880         }
881
882         /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
883         if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
884                 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
885                         ns.disk = D_UP_TO_DATE;
886                 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
887                         ns.pdsk = D_UP_TO_DATE;
888         }
889
890         /* Implications of the connection state on the disk states */
891         disk_min = D_DISKLESS;
892         disk_max = D_UP_TO_DATE;
893         pdsk_min = D_INCONSISTENT;
894         pdsk_max = D_UNKNOWN;
895         switch ((enum drbd_conns)ns.conn) {
896         case C_WF_BITMAP_T:
897         case C_PAUSED_SYNC_T:
898         case C_STARTING_SYNC_T:
899         case C_WF_SYNC_UUID:
900         case C_BEHIND:
901                 disk_min = D_INCONSISTENT;
902                 disk_max = D_OUTDATED;
903                 pdsk_min = D_UP_TO_DATE;
904                 pdsk_max = D_UP_TO_DATE;
905                 break;
906         case C_VERIFY_S:
907         case C_VERIFY_T:
908                 disk_min = D_UP_TO_DATE;
909                 disk_max = D_UP_TO_DATE;
910                 pdsk_min = D_UP_TO_DATE;
911                 pdsk_max = D_UP_TO_DATE;
912                 break;
913         case C_CONNECTED:
914                 disk_min = D_DISKLESS;
915                 disk_max = D_UP_TO_DATE;
916                 pdsk_min = D_DISKLESS;
917                 pdsk_max = D_UP_TO_DATE;
918                 break;
919         case C_WF_BITMAP_S:
920         case C_PAUSED_SYNC_S:
921         case C_STARTING_SYNC_S:
922         case C_AHEAD:
923                 disk_min = D_UP_TO_DATE;
924                 disk_max = D_UP_TO_DATE;
925                 pdsk_min = D_INCONSISTENT;
926                 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
927                 break;
928         case C_SYNC_TARGET:
929                 disk_min = D_INCONSISTENT;
930                 disk_max = D_INCONSISTENT;
931                 pdsk_min = D_UP_TO_DATE;
932                 pdsk_max = D_UP_TO_DATE;
933                 break;
934         case C_SYNC_SOURCE:
935                 disk_min = D_UP_TO_DATE;
936                 disk_max = D_UP_TO_DATE;
937                 pdsk_min = D_INCONSISTENT;
938                 pdsk_max = D_INCONSISTENT;
939                 break;
940         case C_STANDALONE:
941         case C_DISCONNECTING:
942         case C_UNCONNECTED:
943         case C_TIMEOUT:
944         case C_BROKEN_PIPE:
945         case C_NETWORK_FAILURE:
946         case C_PROTOCOL_ERROR:
947         case C_TEAR_DOWN:
948         case C_WF_CONNECTION:
949         case C_WF_REPORT_PARAMS:
950         case C_MASK:
951                 break;
952         }
953         if (ns.disk > disk_max)
954                 ns.disk = disk_max;
955
956         if (ns.disk < disk_min) {
957                 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
958                          drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
959                 ns.disk = disk_min;
960         }
961         if (ns.pdsk > pdsk_max)
962                 ns.pdsk = pdsk_max;
963
964         if (ns.pdsk < pdsk_min) {
965                 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
966                          drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
967                 ns.pdsk = pdsk_min;
968         }
969
970         if (fp == FP_STONITH &&
971             (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
972             !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
973                 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
974
975         if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
976             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
977             !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
978                 ns.susp_nod = 1; /* Suspend IO while no up-to-date data is accessible */
979
980         if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
981                 if (ns.conn == C_SYNC_SOURCE)
982                         ns.conn = C_PAUSED_SYNC_S;
983                 if (ns.conn == C_SYNC_TARGET)
984                         ns.conn = C_PAUSED_SYNC_T;
985         } else {
986                 if (ns.conn == C_PAUSED_SYNC_S)
987                         ns.conn = C_SYNC_SOURCE;
988                 if (ns.conn == C_PAUSED_SYNC_T)
989                         ns.conn = C_SYNC_TARGET;
990         }
991
992         return ns;
993 }
994
995 /* helper for __drbd_set_state */
996 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
997 {
998         if (mdev->agreed_pro_version < 90)
999                 mdev->ov_start_sector = 0;
1000         mdev->rs_total = drbd_bm_bits(mdev);
1001         mdev->ov_position = 0;
1002         if (cs == C_VERIFY_T) {
1003                 /* starting online verify from an arbitrary position
1004                  * does not fit well into the existing protocol.
1005                  * on C_VERIFY_T, we initialize ov_left and friends
1006                  * implicitly in receive_DataRequest once the
1007                  * first P_OV_REQUEST is received */
1008                 mdev->ov_start_sector = ~(sector_t)0;
1009         } else {
1010                 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1011                 if (bit >= mdev->rs_total) {
1012                         mdev->ov_start_sector =
1013                                 BM_BIT_TO_SECT(mdev->rs_total - 1);
1014                         mdev->rs_total = 1;
1015                 } else
1016                         mdev->rs_total -= bit;
1017                 mdev->ov_position = mdev->ov_start_sector;
1018         }
1019         mdev->ov_left = mdev->rs_total;
1020 }
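/* One bitmap bit covers BM_BLOCK_SIZE (4 KiB, i.e. 8 sectors) of the device;
 * BM_SECT_TO_BIT()/BM_BIT_TO_SECT() convert between the two units, so the
 * rs_total and ov_left values above count 4 KiB chunks, not sectors. */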
1021
1022 static void drbd_resume_al(struct drbd_conf *mdev)
1023 {
1024         if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1025                 dev_info(DEV, "Resumed AL updates\n");
1026 }
1027
1028 /**
1029  * __drbd_set_state() - Set a new DRBD state
1030  * @mdev:       DRBD device.
1031  * @ns:         new state.
1032  * @flags:      Flags
1033  * @done:       Optional completion, that will get completed after the after_state_ch() finished
1034  *
1035  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1036  */
1037 enum drbd_state_rv
1038 __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1039                  enum chg_state_flags flags, struct completion *done)
1040 {
1041         union drbd_state os;
1042         enum drbd_state_rv rv = SS_SUCCESS;
1043         const char *warn_sync_abort = NULL;
1044         struct after_state_chg_work *ascw;
1045
1046         os = mdev->state;
1047
1048         ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1049
1050         if (ns.i == os.i)
1051                 return SS_NOTHING_TO_DO;
1052
1053         if (!(flags & CS_HARD)) {
1054                 /* pre-state-change checks; only look at ns */
1055                 /* See drbd_state_sw_errors in drbd_strings.c */
1056
1057                 rv = is_valid_state(mdev, ns);
1058                 if (rv < SS_SUCCESS) {
1059                         /* If the old state was illegal as well, then let
1060                            this happen...*/
1061
1062                         if (is_valid_state(mdev, os) == rv)
1063                                 rv = is_valid_state_transition(mdev, ns, os);
1064                 } else
1065                         rv = is_valid_state_transition(mdev, ns, os);
1066         }
1067
1068         if (rv < SS_SUCCESS) {
1069                 if (flags & CS_VERBOSE)
1070                         print_st_err(mdev, os, ns, rv);
1071                 return rv;
1072         }
1073
1074         if (warn_sync_abort)
1075                 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1076
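        /* Build and log a one-line summary of every state field that changed,
         * e.g. (illustrative) "conn( Connected -> NetworkFailure ) pdsk( UpToDate -> DUnknown ) " */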
1077         {
1078         char *pbp, pb[300];
1079         pbp = pb;
1080         *pbp = 0;
1081         if (ns.role != os.role)
1082                 pbp += sprintf(pbp, "role( %s -> %s ) ",
1083                                drbd_role_str(os.role),
1084                                drbd_role_str(ns.role));
1085         if (ns.peer != os.peer)
1086                 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1087                                drbd_role_str(os.peer),
1088                                drbd_role_str(ns.peer));
1089         if (ns.conn != os.conn)
1090                 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1091                                drbd_conn_str(os.conn),
1092                                drbd_conn_str(ns.conn));
1093         if (ns.disk != os.disk)
1094                 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1095                                drbd_disk_str(os.disk),
1096                                drbd_disk_str(ns.disk));
1097         if (ns.pdsk != os.pdsk)
1098                 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1099                                drbd_disk_str(os.pdsk),
1100                                drbd_disk_str(ns.pdsk));
1101         if (is_susp(ns) != is_susp(os))
1102                 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1103                                is_susp(os),
1104                                is_susp(ns));
1105         if (ns.aftr_isp != os.aftr_isp)
1106                 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1107                                os.aftr_isp,
1108                                ns.aftr_isp);
1109         if (ns.peer_isp != os.peer_isp)
1110                 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1111                                os.peer_isp,
1112                                ns.peer_isp);
1113         if (ns.user_isp != os.user_isp)
1114                 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1115                                os.user_isp,
1116                                ns.user_isp);
1117         dev_info(DEV, "%s\n", pb);
1118         }
1119
1120         /* solve the race between becoming unconfigured,
1121          * worker doing the cleanup, and
1122          * admin reconfiguring us:
1123          * on (re)configure, first set CONFIG_PENDING,
1124          * then wait for a potentially exiting worker,
1125          * start the worker, and schedule one no_op.
1126          * then proceed with configuration.
1127          */
1128         if (ns.disk == D_DISKLESS &&
1129             ns.conn == C_STANDALONE &&
1130             ns.role == R_SECONDARY &&
1131             !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1132                 set_bit(DEVICE_DYING, &mdev->flags);
1133
1134         /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1135          * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1136          * drbd_ldev_destroy() won't happen before our corresponding
1137          * after_state_ch works run, where we put_ldev again. */
1138         if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1139             (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1140                 atomic_inc(&mdev->local_cnt);
1141
1142         mdev->state = ns;
1143         wake_up(&mdev->misc_wait);
1144         wake_up(&mdev->state_wait);
1145
1146         /* aborted verify run. log the last position */
1147         if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1148             ns.conn < C_CONNECTED) {
1149                 mdev->ov_start_sector =
1150                         BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1151                 dev_info(DEV, "Online Verify reached sector %llu\n",
1152                         (unsigned long long)mdev->ov_start_sector);
1153         }
1154
1155         if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1156             (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1157                 dev_info(DEV, "Syncer continues.\n");
1158                 mdev->rs_paused += (long)jiffies
1159                                   -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1160                 if (ns.conn == C_SYNC_TARGET)
1161                         mod_timer(&mdev->resync_timer, jiffies);
1162         }
1163
1164         if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1165             (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1166                 dev_info(DEV, "Resync suspended\n");
1167                 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1168         }
1169
1170         if (os.conn == C_CONNECTED &&
1171             (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1172                 unsigned long now = jiffies;
1173                 int i;
1174
1175                 set_ov_position(mdev, ns.conn);
1176                 mdev->rs_start = now;
1177                 mdev->rs_last_events = 0;
1178                 mdev->rs_last_sect_ev = 0;
1179                 mdev->ov_last_oos_size = 0;
1180                 mdev->ov_last_oos_start = 0;
1181
1182                 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1183                         mdev->rs_mark_left[i] = mdev->ov_left;
1184                         mdev->rs_mark_time[i] = now;
1185                 }
1186
1187                 drbd_rs_controller_reset(mdev);
1188
1189                 if (ns.conn == C_VERIFY_S) {
1190                         dev_info(DEV, "Starting Online Verify from sector %llu\n",
1191                                         (unsigned long long)mdev->ov_position);
1192                         mod_timer(&mdev->resync_timer, jiffies);
1193                 }
1194         }
1195
1196         if (get_ldev(mdev)) {
1197                 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1198                                                  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1199                                                  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1200
1201                 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1202                         mdf |= MDF_CRASHED_PRIMARY;
1203                 if (mdev->state.role == R_PRIMARY ||
1204                     (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1205                         mdf |= MDF_PRIMARY_IND;
1206                 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1207                         mdf |= MDF_CONNECTED_IND;
1208                 if (mdev->state.disk > D_INCONSISTENT)
1209                         mdf |= MDF_CONSISTENT;
1210                 if (mdev->state.disk > D_OUTDATED)
1211                         mdf |= MDF_WAS_UP_TO_DATE;
1212                 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1213                         mdf |= MDF_PEER_OUT_DATED;
1214                 if (mdf != mdev->ldev->md.flags) {
1215                         mdev->ldev->md.flags = mdf;
1216                         drbd_md_mark_dirty(mdev);
1217                 }
1218                 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1219                         drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1220                 put_ldev(mdev);
1221         }
1222
1223         /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
1224         if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1225             os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1226                 set_bit(CONSIDER_RESYNC, &mdev->flags);
1227
1228         /* Receiver should clean up itself */
1229         if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1230                 drbd_thread_stop_nowait(&mdev->receiver);
1231
1232         /* Now that the receiver has finished cleaning up after itself, it should die */
1233         if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1234                 drbd_thread_stop_nowait(&mdev->receiver);
1235
1236         /* Upon network failure, we need to restart the receiver. */
1237         if (os.conn > C_TEAR_DOWN &&
1238             ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1239                 drbd_thread_restart_nowait(&mdev->receiver);
1240
1241         /* Resume AL writing if we get a connection */
1242         if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1243                 drbd_resume_al(mdev);
1244
1245         ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1246         if (ascw) {
1247                 ascw->os = os;
1248                 ascw->ns = ns;
1249                 ascw->flags = flags;
1250                 ascw->w.cb = w_after_state_ch;
1251                 ascw->done = done;
1252                 drbd_queue_work(&mdev->data.work, &ascw->w);
1253         } else {
1254                 dev_warn(DEV, "Could not kmalloc an ascw\n");
1255         }
1256
1257         return rv;
1258 }
1259
1260 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1261 {
1262         struct after_state_chg_work *ascw =
1263                 container_of(w, struct after_state_chg_work, w);
1264         after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1265         if (ascw->flags & CS_WAIT_COMPLETE) {
1266                 D_ASSERT(ascw->done != NULL);
1267                 complete(ascw->done);
1268         }
1269         kfree(ascw);
1270
1271         return 1;
1272 }
1273
1274 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1275 {
1276         if (rv) {
1277                 dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
1278                 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1279                 return;
1280         }
1281
1282         switch (mdev->state.conn) {
1283         case C_STARTING_SYNC_T:
1284                 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1285                 break;
1286         case C_STARTING_SYNC_S:
1287                 drbd_start_resync(mdev, C_SYNC_SOURCE);
1288                 break;
1289         }
1290 }
1291
1292 /**
1293  * after_state_ch() - Perform after state change actions that may sleep
1294  * @mdev:       DRBD device.
1295  * @os:         old state.
1296  * @ns:         new state.
1297  * @flags:      Flags
1298  */
1299 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1300                            union drbd_state ns, enum chg_state_flags flags)
1301 {
1302         enum drbd_fencing_p fp;
1303         enum drbd_req_event what = nothing;
1304         union drbd_state nsm = (union drbd_state){ .i = -1 };
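        /* nsm starts out as an all-ones state; individual suspend bits are
         * cleared below, and "nsm.i &= mdev->state.i" later turns it into the
         * current state minus exactly those suspend flags before it is
         * re-applied via _drbd_set_state(). */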
1305
1306         if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1307                 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1308                 if (mdev->p_uuid)
1309                         mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1310         }
1311
1312         fp = FP_DONT_CARE;
1313         if (get_ldev(mdev)) {
1314                 fp = mdev->ldev->dc.fencing;
1315                 put_ldev(mdev);
1316         }
1317
1318         /* Inform userspace about the change... */
1319         drbd_bcast_state(mdev, ns);
1320
1321         if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1322             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1323                 drbd_khelper(mdev, "pri-on-incon-degr");
1324
1325         /* Here we have the actions that are performed after a
1326            state change. This function might sleep */
1327
1328         nsm.i = -1;
1329         if (ns.susp_nod) {
1330                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1331                         if (ns.conn == C_CONNECTED)
1332                                 what = resend, nsm.susp_nod = 0;
1333                         else /* ns.conn > C_CONNECTED */
1334                                 dev_err(DEV, "Unexpected Resync going on!\n");
1335                 }
1336
1337                 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1338                         what = restart_frozen_disk_io, nsm.susp_nod = 0;
1339
1340         }
1341
1342         if (ns.susp_fen) {
1343                 /* case1: The outdate peer handler is successful: */
1344                 if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1345                         tl_clear(mdev);
1346                         if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1347                                 drbd_uuid_new_current(mdev);
1348                                 clear_bit(NEW_CUR_UUID, &mdev->flags);
1349                         }
1350                         spin_lock_irq(&mdev->req_lock);
1351                         _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1352                         spin_unlock_irq(&mdev->req_lock);
1353                 }
1354                 /* case2: The connection was established again: */
1355                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1356                         clear_bit(NEW_CUR_UUID, &mdev->flags);
1357                         what = resend;
1358                         nsm.susp_fen = 0;
1359                 }
1360         }
1361
1362         if (what != nothing) {
1363                 spin_lock_irq(&mdev->req_lock);
1364                 _tl_restart(mdev, what);
1365                 nsm.i &= mdev->state.i;
1366                 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1367                 spin_unlock_irq(&mdev->req_lock);
1368         }
1369
1370         /* Do not change the order of the if above and the two below... */
1371         if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1372                 drbd_send_uuids(mdev);
1373                 drbd_send_state(mdev);
1374         }
1375         if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1376                 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1377
1378         /* Lost contact to peer's copy of the data */
1379         if ((os.pdsk >= D_INCONSISTENT &&
1380              os.pdsk != D_UNKNOWN &&
1381              os.pdsk != D_OUTDATED)
1382         &&  (ns.pdsk < D_INCONSISTENT ||
1383              ns.pdsk == D_UNKNOWN ||
1384              ns.pdsk == D_OUTDATED)) {
1385                 if (get_ldev(mdev)) {
1386                         if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1387                             mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1388                                 if (is_susp(mdev->state)) {
1389                                         set_bit(NEW_CUR_UUID, &mdev->flags);
1390                                 } else {
1391                                         drbd_uuid_new_current(mdev);
1392                                         drbd_send_uuids(mdev);
1393                                 }
1394                         }
1395                         put_ldev(mdev);
1396                 }
1397         }
1398
1399         if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1400                 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1401                         drbd_uuid_new_current(mdev);
1402                         drbd_send_uuids(mdev);
1403                 }
1404
1405                 /* D_DISKLESS Peer becomes secondary */
1406                 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1407                         drbd_al_to_on_disk_bm(mdev);
1408                 put_ldev(mdev);
1409         }
1410
1411         /* Last part of the attaching process ... */
1412         if (ns.conn >= C_CONNECTED &&
1413             os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1414                 drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1415                 drbd_send_uuids(mdev);
1416                 drbd_send_state(mdev);
1417         }
1418
1419         /* We want to pause/continue resync, tell peer. */
1420         if (ns.conn >= C_CONNECTED &&
1421              ((os.aftr_isp != ns.aftr_isp) ||
1422               (os.user_isp != ns.user_isp)))
1423                 drbd_send_state(mdev);
1424
1425         /* In case one of the isp bits got set, suspend other devices. */
1426         if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1427             (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1428                 suspend_other_sg(mdev);
1429
1430         /* Make sure the peer gets informed about any state
1431            changes (ISP bits) that happened while we were in WFReportParams. */
1432         if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1433                 drbd_send_state(mdev);
1434
1435         if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1436                 drbd_send_state(mdev);
1437
1438         /* We are in the process of starting a full sync... */
1439         if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1440             (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1441                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1442
1443         /* We are invalidating ourselves... */
1444         if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1445             os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1446                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1447
1448         /* first half of local IO error, failure to attach,
1449          * or administrative detach */
1450         if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1451                 enum drbd_io_error_p eh;
1452                 int was_io_error;
1453                 /* corresponding get_ldev was in __drbd_set_state, to serialize
1454                  * our cleanup here with the transition to D_DISKLESS,
1455          * so it is safe to dereference ldev here. */
1456                 eh = mdev->ldev->dc.on_io_error;
1457                 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1458
1459                 /* current state still has to be D_FAILED,
1460                  * there is only one way out: to D_DISKLESS,
1461                  * and that may only happen after our put_ldev below. */
1462                 if (mdev->state.disk != D_FAILED)
1463                         dev_err(DEV,
1464                                 "ASSERT FAILED: disk is %s during detach\n",
1465                                 drbd_disk_str(mdev->state.disk));
1466
1467                 if (drbd_send_state(mdev))
1468                         dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1469                 else
1470                         dev_err(DEV, "Sending state for detaching disk failed\n");
1471
1472                 drbd_rs_cancel_all(mdev);
1473
1474                 /* In case we want to get something to stable storage still,
1475                  * this may be the last chance.
1476                  * Following put_ldev may transition to D_DISKLESS. */
1477                 drbd_md_sync(mdev);
1478                 put_ldev(mdev);
1479
1480                 if (was_io_error && eh == EP_CALL_HELPER)
1481                         drbd_khelper(mdev, "local-io-error");
1482         }
1483
1484         /* second half of local IO error, failure to attach,
1485          * or administrative detach,
1486          * after local_cnt references have reached zero again */
1487         if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1488                 /* We must still be diskless,
1489                  * re-attach has to be serialized with this! */
1490                 if (mdev->state.disk != D_DISKLESS)
1491                         dev_err(DEV,
1492                                 "ASSERT FAILED: disk is %s while going diskless\n",
1493                                 drbd_disk_str(mdev->state.disk));
1494
1495                 mdev->rs_total = 0;
1496                 mdev->rs_failed = 0;
1497                 atomic_set(&mdev->rs_pending_cnt, 0);
1498
1499                 if (drbd_send_state(mdev))
1500                         dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1501                 else
1502                         dev_err(DEV, "Sending state for being diskless failed\n");
1503                 /* corresponding get_ldev in __drbd_set_state
1504          * this may finally trigger drbd_ldev_destroy. */
1505                 put_ldev(mdev);
1506         }
1507
1508         /* Disks got bigger while they were detached */
1509         if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1510             test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1511                 if (ns.conn == C_CONNECTED)
1512                         resync_after_online_grow(mdev);
1513         }
1514
1515         /* A resync finished or aborted, wake paused devices... */
1516         if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1517             (os.peer_isp && !ns.peer_isp) ||
1518             (os.user_isp && !ns.user_isp))
1519                 resume_next_sg(mdev);
1520
1521         /* sync target done with resync.  Explicitly notify peer, even though
1522          * it should (at least for non-empty resyncs) already know itself. */
1523         if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1524                 drbd_send_state(mdev);
1525
1526         /* free tl_hash if we got thawed and are C_STANDALONE */
1527         if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1528                 drbd_free_tl_hash(mdev);
1529
1530         /* Upon network connection, we need to start the receiver */
1531         if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1532                 drbd_thread_start(&mdev->receiver);
1533
1534         /* Terminate worker thread if we are unconfigured - it will be
1535            restarted as needed... */
1536         if (ns.disk == D_DISKLESS &&
1537             ns.conn == C_STANDALONE &&
1538             ns.role == R_SECONDARY) {
1539                 if (os.aftr_isp != ns.aftr_isp)
1540                         resume_next_sg(mdev);
1541                 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1542                 if (test_bit(DEVICE_DYING, &mdev->flags))
1543                         drbd_thread_stop_nowait(&mdev->worker);
1544         }
1545
1546         drbd_md_sync(mdev);
1547 }
1548
1549
1550 static int drbd_thread_setup(void *arg)
1551 {
1552         struct drbd_thread *thi = (struct drbd_thread *) arg;
1553         struct drbd_conf *mdev = thi->mdev;
1554         unsigned long flags;
1555         int retval;
1556
1557 restart:
1558         retval = thi->function(thi);
1559
1560         spin_lock_irqsave(&thi->t_lock, flags);
1561
1562         /* if the receiver has been "Exiting", the last thing it did
1563          * was set the conn state to "StandAlone",
1564          * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1565          * and receiver thread will be "started".
1566          * drbd_thread_start needs to set "Restarting" in that case.
1567          * t_state check and assignment needs to be within the same spinlock,
1568          * so either thread_start sees Exiting, and can remap to Restarting,
1569          * or thread_start sees None, and can proceed as normal.
1570          */
1571
1572         if (thi->t_state == Restarting) {
1573                 dev_info(DEV, "Restarting %s\n", current->comm);
1574                 thi->t_state = Running;
1575                 spin_unlock_irqrestore(&thi->t_lock, flags);
1576                 goto restart;
1577         }
1578
1579         thi->task = NULL;
1580         thi->t_state = None;
1581         smp_mb();
1582         complete(&thi->stop);
1583         spin_unlock_irqrestore(&thi->t_lock, flags);
1584
1585         dev_info(DEV, "Terminating %s\n", current->comm);
1586
1587         /* Release mod reference taken when thread was started */
1588         module_put(THIS_MODULE);
1589         return retval;
1590 }
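/*
 * Illustrative timeline (editor's sketch, not part of the driver) of the
 * Exiting -> Restarting handshake described in drbd_thread_setup() above,
 * for the receiver thread racing with a concurrent drbd_thread_start():
 *
 *   receiver: thi->function() returns after the conn state went StandAlone
 *   admin:    drbd_thread_start() takes t_lock, sees t_state == Exiting,
 *             sets t_state = Restarting, drops t_lock
 *   receiver: drbd_thread_setup() takes t_lock, sees Restarting, sets
 *             Running again and jumps back to "restart:" -- the existing
 *             kthread is reused instead of being torn down and recreated.
 *
 * Only if t_state is not Restarting at that point does the thread fall
 * through to the cleanup path (task = NULL, t_state = None,
 * complete(&thi->stop)).
 */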
1591
1592 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1593                       int (*func) (struct drbd_thread *))
1594 {
1595         spin_lock_init(&thi->t_lock);
1596         thi->task    = NULL;
1597         thi->t_state = None;
1598         thi->function = func;
1599         thi->mdev = mdev;
1600 }
1601
1602 int drbd_thread_start(struct drbd_thread *thi)
1603 {
1604         struct drbd_conf *mdev = thi->mdev;
1605         struct task_struct *nt;
1606         unsigned long flags;
1607
1608         const char *me =
1609                 thi == &mdev->receiver ? "receiver" :
1610                 thi == &mdev->asender  ? "asender"  :
1611                 thi == &mdev->worker   ? "worker"   : "NONSENSE";
1612
1613         /* is used from state engine doing drbd_thread_stop_nowait,
1614          * while holding the req lock irqsave */
1615         spin_lock_irqsave(&thi->t_lock, flags);
1616
1617         switch (thi->t_state) {
1618         case None:
1619                 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1620                                 me, current->comm, current->pid);
1621
1622                 /* Get ref on module for thread - this is released when thread exits */
1623                 if (!try_module_get(THIS_MODULE)) {
1624                         dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1625                         spin_unlock_irqrestore(&thi->t_lock, flags);
1626                         return false;
1627                 }
1628
1629                 init_completion(&thi->stop);
1630                 D_ASSERT(thi->task == NULL);
1631                 thi->reset_cpu_mask = 1;
1632                 thi->t_state = Running;
1633                 spin_unlock_irqrestore(&thi->t_lock, flags);
1634                 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1635
1636                 nt = kthread_create(drbd_thread_setup, (void *) thi,
1637                                     "drbd%d_%s", mdev_to_minor(mdev), me);
1638
1639                 if (IS_ERR(nt)) {
1640                         dev_err(DEV, "Couldn't start thread\n");
1641
1642                         module_put(THIS_MODULE);
1643                         return false;
1644                 }
1645                 spin_lock_irqsave(&thi->t_lock, flags);
1646                 thi->task = nt;
1647                 thi->t_state = Running;
1648                 spin_unlock_irqrestore(&thi->t_lock, flags);
1649                 wake_up_process(nt);
1650                 break;
1651         case Exiting:
1652                 thi->t_state = Restarting;
1653                 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1654                                 me, current->comm, current->pid);
1655                 /* fall through */
1656         case Running:
1657         case Restarting:
1658         default:
1659                 spin_unlock_irqrestore(&thi->t_lock, flags);
1660                 break;
1661         }
1662
1663         return true;
1664 }
1665
1666
1667 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1668 {
1669         unsigned long flags;
1670
1671         enum drbd_thread_state ns = restart ? Restarting : Exiting;
1672
1673         /* may be called from state engine, holding the req lock irqsave */
1674         spin_lock_irqsave(&thi->t_lock, flags);
1675
1676         if (thi->t_state == None) {
1677                 spin_unlock_irqrestore(&thi->t_lock, flags);
1678                 if (restart)
1679                         drbd_thread_start(thi);
1680                 return;
1681         }
1682
1683         if (thi->t_state != ns) {
1684                 if (thi->task == NULL) {
1685                         spin_unlock_irqrestore(&thi->t_lock, flags);
1686                         return;
1687                 }
1688
1689                 thi->t_state = ns;
1690                 smp_mb();
1691                 init_completion(&thi->stop);
1692                 if (thi->task != current)
1693                         force_sig(DRBD_SIGKILL, thi->task);
1694
1695         }
1696
1697         spin_unlock_irqrestore(&thi->t_lock, flags);
1698
1699         if (wait)
1700                 wait_for_completion(&thi->stop);
1701 }
1702
1703 #ifdef CONFIG_SMP
1704 /**
1705  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1706  * @mdev:       DRBD device.
1707  *
1708  * Forces all threads of a device onto the same CPU. This is beneficial for
1709  * DRBD's performance. May be overridden by the user's configuration.
1710  */
1711 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1712 {
1713         int ord, cpu;
1714
1715         /* user override. */
1716         if (cpumask_weight(mdev->cpu_mask))
1717                 return;
1718
1719         ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1720         for_each_online_cpu(cpu) {
1721                 if (ord-- == 0) {
1722                         cpumask_set_cpu(cpu, mdev->cpu_mask);
1723                         return;
1724                 }
1725         }
1726         /* should not be reached */
1727         cpumask_setall(mdev->cpu_mask);
1728 }
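/*
 * Worked example (editor's sketch): with 8 CPUs online,
 *     ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask)
 * maps minor 0 to the first online CPU, minor 1 to the second, and
 * minor 10 wraps around to the third (10 % 8 == 2).  A non-empty,
 * user-configured mdev->cpu_mask takes precedence and is left untouched.
 */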
1729
1730 /**
1731  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1732  * @mdev:       DRBD device.
1733  *
1734  * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1735  * prematurely.
1736  */
1737 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1738 {
1739         struct task_struct *p = current;
1740         struct drbd_thread *thi =
1741                 p == mdev->asender.task  ? &mdev->asender  :
1742                 p == mdev->receiver.task ? &mdev->receiver :
1743                 p == mdev->worker.task   ? &mdev->worker   :
1744                 NULL;
1745         ERR_IF(thi == NULL)
1746                 return;
1747         if (!thi->reset_cpu_mask)
1748                 return;
1749         thi->reset_cpu_mask = 0;
1750         set_cpus_allowed_ptr(p, mdev->cpu_mask);
1751 }
1752 #endif
1753
1754 /* the appropriate socket mutex must be held already */
1755 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1756                           enum drbd_packets cmd, struct p_header80 *h,
1757                           size_t size, unsigned msg_flags)
1758 {
1759         int sent, ok;
1760
1761         ERR_IF(!h) return false;
1762         ERR_IF(!size) return false;
1763
1764         h->magic   = BE_DRBD_MAGIC;
1765         h->command = cpu_to_be16(cmd);
1766         h->length  = cpu_to_be16(size-sizeof(struct p_header80));
1767
1768         sent = drbd_send(mdev, sock, h, size, msg_flags);
1769
1770         ok = (sent == size);
1771         if (!ok)
1772                 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1773                     cmdname(cmd), (int)size, sent);
1774         return ok;
1775 }
1776
1777 /* don't pass the socket. we may only look at it
1778  * when we hold the appropriate socket mutex.
1779  */
1780 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1781                   enum drbd_packets cmd, struct p_header80 *h, size_t size)
1782 {
1783         int ok = 0;
1784         struct socket *sock;
1785
1786         if (use_data_socket) {
1787                 mutex_lock(&mdev->data.mutex);
1788                 sock = mdev->data.socket;
1789         } else {
1790                 mutex_lock(&mdev->meta.mutex);
1791                 sock = mdev->meta.socket;
1792         }
1793
1794         /* drbd_disconnect() could have called drbd_free_sock()
1795          * while we were waiting in down()... */
1796         if (likely(sock != NULL))
1797                 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1798
1799         if (use_data_socket)
1800                 mutex_unlock(&mdev->data.mutex);
1801         else
1802                 mutex_unlock(&mdev->meta.mutex);
1803         return ok;
1804 }
1805
1806 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1807                    size_t size)
1808 {
1809         struct p_header80 h;
1810         int ok;
1811
1812         h.magic   = BE_DRBD_MAGIC;
1813         h.command = cpu_to_be16(cmd);
1814         h.length  = cpu_to_be16(size);
1815
1816         if (!drbd_get_data_sock(mdev))
1817                 return 0;
1818
1819         ok = (sizeof(h) ==
1820                 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1821         ok = ok && (size ==
1822                 drbd_send(mdev, mdev->data.socket, data, size, 0));
1823
1824         drbd_put_data_sock(mdev);
1825
1826         return ok;
1827 }
1828
1829 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1830 {
1831         struct p_rs_param_95 *p;
1832         struct socket *sock;
1833         int size, rv;
1834         const int apv = mdev->agreed_pro_version;
1835
1836         size = apv <= 87 ? sizeof(struct p_rs_param)
1837                 : apv == 88 ? sizeof(struct p_rs_param)
1838                         + strlen(mdev->sync_conf.verify_alg) + 1
1839                 : apv <= 94 ? sizeof(struct p_rs_param_89)
1840                 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1841
1842         /* used from admin command context and receiver/worker context.
1843          * to avoid kmalloc, grab the socket right here,
1844          * then use the pre-allocated sbuf there */
1845         mutex_lock(&mdev->data.mutex);
1846         sock = mdev->data.socket;
1847
1848         if (likely(sock != NULL)) {
1849                 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1850
1851                 p = &mdev->data.sbuf.rs_param_95;
1852
1853                 /* initialize verify_alg and csums_alg */
1854                 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1855
1856                 p->rate = cpu_to_be32(sc->rate);
1857                 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1858                 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1859                 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1860                 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1861
1862                 if (apv >= 88)
1863                         strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1864                 if (apv >= 89)
1865                         strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1866
1867                 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1868         } else
1869                 rv = 0; /* not ok */
1870
1871         mutex_unlock(&mdev->data.mutex);
1872
1873         return rv;
1874 }
1875
1876 int drbd_send_protocol(struct drbd_conf *mdev)
1877 {
1878         struct p_protocol *p;
1879         int size, cf, rv;
1880
1881         size = sizeof(struct p_protocol);
1882
1883         if (mdev->agreed_pro_version >= 87)
1884                 size += strlen(mdev->net_conf->integrity_alg) + 1;
1885
1886         /* we must not recurse into our own queue,
1887          * as that is blocked during handshake */
1888         p = kmalloc(size, GFP_NOIO);
1889         if (p == NULL)
1890                 return 0;
1891
1892         p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1893         p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1894         p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1895         p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1896         p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1897
1898         cf = 0;
1899         if (mdev->net_conf->want_lose)
1900                 cf |= CF_WANT_LOSE;
1901         if (mdev->net_conf->dry_run) {
1902                 if (mdev->agreed_pro_version >= 92)
1903                         cf |= CF_DRY_RUN;
1904                 else {
1905                         dev_err(DEV, "--dry-run is not supported by peer");
1906                         kfree(p);
1907                         return 0;
1908                 }
1909         }
1910         p->conn_flags    = cpu_to_be32(cf);
1911
1912         if (mdev->agreed_pro_version >= 87)
1913                 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1914
1915         rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1916                            (struct p_header80 *)p, size);
1917         kfree(p);
1918         return rv;
1919 }
1920
1921 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1922 {
1923         struct p_uuids p;
1924         int i;
1925
1926         if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1927                 return 1;
1928
1929         for (i = UI_CURRENT; i < UI_SIZE; i++)
1930                 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1931
1932         mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1933         p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1934         uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1935         uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1936         uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1937         p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1938
1939         put_ldev(mdev);
1940
1941         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1942                              (struct p_header80 *)&p, sizeof(p));
1943 }
1944
1945 int drbd_send_uuids(struct drbd_conf *mdev)
1946 {
1947         return _drbd_send_uuids(mdev, 0);
1948 }
1949
1950 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1951 {
1952         return _drbd_send_uuids(mdev, 8);
1953 }
1954
1955
1956 int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1957 {
1958         struct p_rs_uuid p;
1959
1960         p.uuid = cpu_to_be64(val);
1961
1962         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1963                              (struct p_header80 *)&p, sizeof(p));
1964 }
1965
1966 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1967 {
1968         struct p_sizes p;
1969         sector_t d_size, u_size;
1970         int q_order_type;
1971         int ok;
1972
1973         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1974                 D_ASSERT(mdev->ldev->backing_bdev);
1975                 d_size = drbd_get_max_capacity(mdev->ldev);
1976                 u_size = mdev->ldev->dc.disk_size;
1977                 q_order_type = drbd_queue_order_type(mdev);
1978                 put_ldev(mdev);
1979         } else {
1980                 d_size = 0;
1981                 u_size = 0;
1982                 q_order_type = QUEUE_ORDERED_NONE;
1983         }
1984
1985         p.d_size = cpu_to_be64(d_size);
1986         p.u_size = cpu_to_be64(u_size);
1987         p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1988         p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
1989         p.queue_order_type = cpu_to_be16(q_order_type);
1990         p.dds_flags = cpu_to_be16(flags);
1991
1992         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1993                            (struct p_header80 *)&p, sizeof(p));
1994         return ok;
1995 }
1996
1997 /**
1998  * drbd_send_state() - Sends the drbd state to the peer
1999  * @mdev:       DRBD device.
2000  */
2001 int drbd_send_state(struct drbd_conf *mdev)
2002 {
2003         struct socket *sock;
2004         struct p_state p;
2005         int ok = 0;
2006
2007         /* Grab state lock so we won't send state if we're in the middle
2008          * of a cluster wide state change on another thread */
2009         drbd_state_lock(mdev);
2010
2011         mutex_lock(&mdev->data.mutex);
2012
2013         p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2014         sock = mdev->data.socket;
2015
2016         if (likely(sock != NULL)) {
2017                 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2018                                     (struct p_header80 *)&p, sizeof(p), 0);
2019         }
2020
2021         mutex_unlock(&mdev->data.mutex);
2022
2023         drbd_state_unlock(mdev);
2024         return ok;
2025 }
2026
2027 int drbd_send_state_req(struct drbd_conf *mdev,
2028         union drbd_state mask, union drbd_state val)
2029 {
2030         struct p_req_state p;
2031
2032         p.mask    = cpu_to_be32(mask.i);
2033         p.val     = cpu_to_be32(val.i);
2034
2035         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2036                              (struct p_header80 *)&p, sizeof(p));
2037 }
2038
2039 int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2040 {
2041         struct p_req_state_reply p;
2042
2043         p.retcode    = cpu_to_be32(retcode);
2044
2045         return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2046                              (struct p_header80 *)&p, sizeof(p));
2047 }
2048
2049 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2050         struct p_compressed_bm *p,
2051         struct bm_xfer_ctx *c)
2052 {
2053         struct bitstream bs;
2054         unsigned long plain_bits;
2055         unsigned long tmp;
2056         unsigned long rl;
2057         unsigned len;
2058         unsigned toggle;
2059         int bits;
2060
2061         /* may we use this feature? */
2062         if ((mdev->sync_conf.use_rle == 0) ||
2063                 (mdev->agreed_pro_version < 90))
2064                         return 0;
2065
2066         if (c->bit_offset >= c->bm_bits)
2067                 return 0; /* nothing to do. */
2068
2069         /* use at most this many bytes */
2070         bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2071         memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2072         /* plain bits covered in this code string */
2073         plain_bits = 0;
2074
2075         /* p->encoding & 0x80 stores whether the first run length is set.
2076          * bit offset is implicit.
2077          * start with toggle == 2 to be able to tell the first iteration */
2078         toggle = 2;
2079
2080         /* see how many plain bits we can stuff into one packet
2081          * using RLE and VLI. */
2082         do {
2083                 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2084                                     : _drbd_bm_find_next(mdev, c->bit_offset);
2085                 if (tmp == -1UL)
2086                         tmp = c->bm_bits;
2087                 rl = tmp - c->bit_offset;
2088
2089                 if (toggle == 2) { /* first iteration */
2090                         if (rl == 0) {
2091                                 /* the first checked bit was set,
2092                                  * store start value, */
2093                                 DCBP_set_start(p, 1);
2094                                 /* but skip encoding of zero run length */
2095                                 toggle = !toggle;
2096                                 continue;
2097                         }
2098                         DCBP_set_start(p, 0);
2099                 }
2100
2101                 /* paranoia: catch zero runlength.
2102                  * can only happen if bitmap is modified while we scan it. */
2103                 if (rl == 0) {
2104                         dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2105                             "t:%u bo:%lu\n", toggle, c->bit_offset);
2106                         return -1;
2107                 }
2108
2109                 bits = vli_encode_bits(&bs, rl);
2110                 if (bits == -ENOBUFS) /* buffer full */
2111                         break;
2112                 if (bits <= 0) {
2113                         dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2114                         return 0;
2115                 }
2116
2117                 toggle = !toggle;
2118                 plain_bits += rl;
2119                 c->bit_offset = tmp;
2120         } while (c->bit_offset < c->bm_bits);
2121
2122         len = bs.cur.b - p->code + !!bs.cur.bit;
2123
2124         if (plain_bits < (len << 3)) {
2125                 /* incompressible with this method.
2126                  * we need to rewind both word and bit position. */
2127                 c->bit_offset -= plain_bits;
2128                 bm_xfer_ctx_bit_to_word_offset(c);
2129                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2130                 return 0;
2131         }
2132
2133         /* RLE + VLI was able to compress it just fine.
2134          * update c->word_offset. */
2135         bm_xfer_ctx_bit_to_word_offset(c);
2136
2137         /* store pad_bits */
2138         DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2139
2140         return len;
2141 }
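/*
 * Worked example (editor's sketch) of the encoding built above: a bitmap
 * chunk of 50 clear bits, 3 set bits, 20 clear bits is sent as the run
 * lengths 50, 3, 20, each VLI-encoded into p->code, with the "start value"
 * bit in p->encoding cleared because the first run describes clear bits.
 * Had the chunk started with set bits, the initial zero-length clear run
 * would be skipped and the start bit set instead.  The result is only used
 * when plain_bits >= len * 8, i.e. when the code string is actually smaller
 * than shipping the same region as plain bitmap words.
 */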
2142
2143 /**
2144  * send_bitmap_rle_or_plain() - send one bitmap packet, RLE-compressed when beneficial
2145  *
2146  * Return 0 when done, 1 when another iteration is needed, and a negative error
2147  * code upon failure.
2148  */
2149 static int
2150 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2151                          struct p_header80 *h, struct bm_xfer_ctx *c)
2152 {
2153         struct p_compressed_bm *p = (void*)h;
2154         unsigned long num_words;
2155         int len;
2156         int ok;
2157
2158         len = fill_bitmap_rle_bits(mdev, p, c);
2159
2160         if (len < 0)
2161                 return -EIO;
2162
2163         if (len) {
2164                 DCBP_set_code(p, RLE_VLI_Bits);
2165                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2166                         sizeof(*p) + len, 0);
2167
2168                 c->packets[0]++;
2169                 c->bytes[0] += sizeof(*p) + len;
2170
2171                 if (c->bit_offset >= c->bm_bits)
2172                         len = 0; /* DONE */
2173         } else {
2174                 /* was not compressible.
2175                  * send a buffer full of plain text bits instead. */
2176                 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2177                 len = num_words * sizeof(long);
2178                 if (len)
2179                         drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2180                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2181                                    h, sizeof(struct p_header80) + len, 0);
2182                 c->word_offset += num_words;
2183                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2184
2185                 c->packets[1]++;
2186                 c->bytes[1] += sizeof(struct p_header80) + len;
2187
2188                 if (c->bit_offset > c->bm_bits)
2189                         c->bit_offset = c->bm_bits;
2190         }
2191         if (ok) {
2192                 if (len == 0) {
2193                         INFO_bm_xfer_stats(mdev, "send", c);
2194                         return 0;
2195                 } else
2196                         return 1;
2197         }
2198         return -EIO;
2199 }
2200
2201 /* See the comment at receive_bitmap() */
2202 int _drbd_send_bitmap(struct drbd_conf *mdev)
2203 {
2204         struct bm_xfer_ctx c;
2205         struct p_header80 *p;
2206         int err;
2207
2208         ERR_IF(!mdev->bitmap) return false;
2209
2210         /* maybe we should use some per thread scratch page,
2211          * and allocate that during initial device creation? */
2212         p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2213         if (!p) {
2214                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2215                 return false;
2216         }
2217
2218         if (get_ldev(mdev)) {
2219                 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2220                         dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2221                         drbd_bm_set_all(mdev);
2222                         if (drbd_bm_write(mdev)) {
2223                                 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2224                                  * but otherwise process as per normal - need to tell other
2225                                  * side that a full resync is required! */
2226                                 dev_err(DEV, "Failed to write bitmap to disk!\n");
2227                         } else {
2228                                 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2229                                 drbd_md_sync(mdev);
2230                         }
2231                 }
2232                 put_ldev(mdev);
2233         }
2234
2235         c = (struct bm_xfer_ctx) {
2236                 .bm_bits = drbd_bm_bits(mdev),
2237                 .bm_words = drbd_bm_words(mdev),
2238         };
2239
2240         do {
2241                 err = send_bitmap_rle_or_plain(mdev, p, &c);
2242         } while (err > 0);
2243
2244         free_page((unsigned long) p);
2245         return err == 0;
2246 }
2247
2248 int drbd_send_bitmap(struct drbd_conf *mdev)
2249 {
2250         int err;
2251
2252         if (!drbd_get_data_sock(mdev))
2253                 return -1;
2254         err = !_drbd_send_bitmap(mdev);
2255         drbd_put_data_sock(mdev);
2256         return err;
2257 }
2258
2259 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2260 {
2261         int ok;
2262         struct p_barrier_ack p;
2263
2264         p.barrier  = barrier_nr;
2265         p.set_size = cpu_to_be32(set_size);
2266
2267         if (mdev->state.conn < C_CONNECTED)
2268                 return false;
2269         ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2270                         (struct p_header80 *)&p, sizeof(p));
2271         return ok;
2272 }
2273
2274 /**
2275  * _drbd_send_ack() - Sends an ack packet
2276  * @mdev:       DRBD device.
2277  * @cmd:        Packet command code.
2278  * @sector:     sector, needs to be in big endian byte order
2279  * @blksize:    size in byte, needs to be in big endian byte order
2280  * @block_id:   Id, big endian byte order
2281  */
2282 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2283                           u64 sector,
2284                           u32 blksize,
2285                           u64 block_id)
2286 {
2287         int ok;
2288         struct p_block_ack p;
2289
2290         p.sector   = sector;
2291         p.block_id = block_id;
2292         p.blksize  = blksize;
2293         p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2294
2295         if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2296                 return false;
2297         ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2298                                 (struct p_header80 *)&p, sizeof(p));
2299         return ok;
2300 }
2301
2302 /* dp->sector and dp->block_id already/still in network byte order,
2303  * data_size is payload size according to dp->head,
2304  * and may need to be corrected for digest size. */
2305 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2306                      struct p_data *dp, int data_size)
2307 {
2308         data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2309                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2310         return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2311                               dp->block_id);
2312 }
2313
2314 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2315                      struct p_block_req *rp)
2316 {
2317         return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2318 }
2319
2320 /**
2321  * drbd_send_ack() - Sends an ack packet
2322  * @mdev:       DRBD device.
2323  * @cmd:        Packet command code.
2324  * @e:          Epoch entry.
2325  */
2326 int drbd_send_ack(struct drbd_conf *mdev,
2327         enum drbd_packets cmd, struct drbd_epoch_entry *e)
2328 {
2329         return _drbd_send_ack(mdev, cmd,
2330                               cpu_to_be64(e->sector),
2331                               cpu_to_be32(e->size),
2332                               e->block_id);
2333 }
2334
2335 /* This function misuses the block_id field to signal if the blocks
2336  * are in sync or not. */
2337 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2338                      sector_t sector, int blksize, u64 block_id)
2339 {
2340         return _drbd_send_ack(mdev, cmd,
2341                               cpu_to_be64(sector),
2342                               cpu_to_be32(blksize),
2343                               cpu_to_be64(block_id));
2344 }
2345
2346 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2347                        sector_t sector, int size, u64 block_id)
2348 {
2349         int ok;
2350         struct p_block_req p;
2351
2352         p.sector   = cpu_to_be64(sector);
2353         p.block_id = block_id;
2354         p.blksize  = cpu_to_be32(size);
2355
2356         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2357                                 (struct p_header80 *)&p, sizeof(p));
2358         return ok;
2359 }
2360
2361 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2362                             sector_t sector, int size,
2363                             void *digest, int digest_size,
2364                             enum drbd_packets cmd)
2365 {
2366         int ok;
2367         struct p_block_req p;
2368
2369         p.sector   = cpu_to_be64(sector);
2370         p.block_id = BE_DRBD_MAGIC + 0xbeef;
2371         p.blksize  = cpu_to_be32(size);
2372
2373         p.head.magic   = BE_DRBD_MAGIC;
2374         p.head.command = cpu_to_be16(cmd);
2375         p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2376
2377         mutex_lock(&mdev->data.mutex);
2378
2379         ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2380         ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2381
2382         mutex_unlock(&mdev->data.mutex);
2383
2384         return ok;
2385 }
2386
2387 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2388 {
2389         int ok;
2390         struct p_block_req p;
2391
2392         p.sector   = cpu_to_be64(sector);
2393         p.block_id = BE_DRBD_MAGIC + 0xbabe;
2394         p.blksize  = cpu_to_be32(size);
2395
2396         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2397                            (struct p_header80 *)&p, sizeof(p));
2398         return ok;
2399 }
2400
2401 /* called on sndtimeo
2402  * returns false if we should retry,
2403  * true if we think the connection is dead
2404  */
2405 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2406 {
2407         int drop_it;
2408         /* long elapsed = (long)(jiffies - mdev->last_received); */
2409
2410         drop_it =   mdev->meta.socket == sock
2411                 || !mdev->asender.task
2412                 || get_t_state(&mdev->asender) != Running
2413                 || mdev->state.conn < C_CONNECTED;
2414
2415         if (drop_it)
2416                 return true;
2417
2418         drop_it = !--mdev->ko_count;
2419         if (!drop_it) {
2420                 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2421                        current->comm, current->pid, mdev->ko_count);
2422                 request_ping(mdev);
2423         }
2424
2425         return drop_it; /* && (mdev->state == R_PRIMARY) */;
2426 }
2427
2428 /* The idea of sendpage seems to be to put some kind of reference
2429  * to the page into the skb, and to hand it over to the NIC. In
2430  * this process get_page() gets called.
2431  *
2432  * As soon as the page was really sent over the network put_page()
2433  * gets called by some part of the network layer. [ NIC driver? ]
2434  *
2435  * [ get_page() / put_page() increment/decrement the count. If count
2436  *   reaches 0 the page will be freed. ]
2437  *
2438  * This works nicely with pages from FSs.
2439  * But this means that in protocol A we might signal IO completion too early!
2440  *
2441  * In order not to corrupt data during a resync we must make sure
2442  * that we do not reuse our own buffer pages (EEs) too early, therefore
2443  * we have the net_ee list.
2444  *
2445  * XFS still seems to have problems: it submits pages with page_count == 0!
2446  * As a workaround, we disable sendpage on pages
2447  * with page_count == 0 or PageSlab.
2448  */
2449 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2450                    int offset, size_t size, unsigned msg_flags)
2451 {
2452         int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2453         kunmap(page);
2454         if (sent == size)
2455                 mdev->send_cnt += size>>9;
2456         return sent == size;
2457 }
2458
2459 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2460                     int offset, size_t size, unsigned msg_flags)
2461 {
2462         mm_segment_t oldfs = get_fs();
2463         int sent, ok;
2464         int len = size;
2465
2466         /* e.g. XFS meta- & log-data is in slab pages, which have a
2467          * page_count of 0 and/or have PageSlab() set.
2468          * we cannot use send_page for those, as that does get_page();
2469          * put_page(); and would cause either a VM_BUG directly, or
2470          * __page_cache_release a page that would actually still be referenced
2471          * by someone, leading to some obscure delayed Oops somewhere else. */
2472         if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2473                 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2474
2475         msg_flags |= MSG_NOSIGNAL;
2476         drbd_update_congested(mdev);
2477         set_fs(KERNEL_DS);
2478         do {
2479                 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2480                                                         offset, len,
2481                                                         msg_flags);
2482                 if (sent == -EAGAIN) {
2483                         if (we_should_drop_the_connection(mdev,
2484                                                           mdev->data.socket))
2485                                 break;
2486                         else
2487                                 continue;
2488                 }
2489                 if (sent <= 0) {
2490                         dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2491                              __func__, (int)size, len, sent);
2492                         break;
2493                 }
2494                 len    -= sent;
2495                 offset += sent;
2496         } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2497         set_fs(oldfs);
2498         clear_bit(NET_CONGESTED, &mdev->flags);
2499
2500         ok = (len == 0);
2501         if (likely(ok))
2502                 mdev->send_cnt += size>>9;
2503         return ok;
2504 }
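/*
 * Illustrative decision sketch (editor's addition): a regular page cache
 * page goes through _drbd_send_page() above, which hands a reference to
 * ->sendpage() for zero-copy transmission; a slab-backed page such as XFS
 * metadata (PageSlab(page), or page_count(page) < 1), or any page when
 * disable_sendpage is set, falls back to _drbd_no_send_page(), which
 * kmap()s the page and copies the data into the socket via drbd_send().
 */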
2505
2506 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2507 {
2508         struct bio_vec *bvec;
2509         int i;
2510         /* hint all but last page with MSG_MORE */
2511         __bio_for_each_segment(bvec, bio, i, 0) {
2512                 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2513                                      bvec->bv_offset, bvec->bv_len,
2514                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2515                         return 0;
2516         }
2517         return 1;
2518 }
2519
2520 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2521 {
2522         struct bio_vec *bvec;
2523         int i;
2524         /* hint all but last page with MSG_MORE */
2525         __bio_for_each_segment(bvec, bio, i, 0) {
2526                 if (!_drbd_send_page(mdev, bvec->bv_page,
2527                                      bvec->bv_offset, bvec->bv_len,
2528                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2529                         return 0;
2530         }
2531         return 1;
2532 }
2533
2534 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2535 {
2536         struct page *page = e->pages;
2537         unsigned len = e->size;
2538         /* hint all but last page with MSG_MORE */
2539         page_chain_for_each(page) {
2540                 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2541                 if (!_drbd_send_page(mdev, page, 0, l,
2542                                 page_chain_next(page) ? MSG_MORE : 0))
2543                         return 0;
2544                 len -= l;
2545         }
2546         return 1;
2547 }
2548
2549 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2550 {
2551         if (mdev->agreed_pro_version >= 95)
2552                 return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2553                         (bi_rw & REQ_FUA ? DP_FUA : 0) |
2554                         (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2555                         (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2556         else
2557                 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2558 }
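/*
 * Example (editor's sketch): for a write bio carrying REQ_SYNC|REQ_FUA and
 * an agreed protocol version >= 95, bio_flags_to_wire() yields
 * DP_RW_SYNC|DP_FUA; against an older peer only DP_RW_SYNC survives, so
 * FUA/FLUSH/DISCARD hints are silently dropped on the wire.
 */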
2559
2560 /* Used to send write requests
2561  * R_PRIMARY -> Peer    (P_DATA)
2562  */
2563 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2564 {
2565         int ok = 1;
2566         struct p_data p;
2567         unsigned int dp_flags = 0;
2568         void *dgb;
2569         int dgs;
2570
2571         if (!drbd_get_data_sock(mdev))
2572                 return 0;
2573
2574         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2575                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2576
2577         if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2578                 p.head.h80.magic   = BE_DRBD_MAGIC;
2579                 p.head.h80.command = cpu_to_be16(P_DATA);
2580                 p.head.h80.length  =
2581                         cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2582         } else {
2583                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2584                 p.head.h95.command = cpu_to_be16(P_DATA);
2585                 p.head.h95.length  =
2586                         cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2587         }
2588
2589         p.sector   = cpu_to_be64(req->sector);
2590         p.block_id = (unsigned long)req;
2591         p.seq_num  = cpu_to_be32(req->seq_num =
2592                                  atomic_add_return(1, &mdev->packet_seq));
2593
2594         dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2595
2596         if (mdev->state.conn >= C_SYNC_SOURCE &&
2597             mdev->state.conn <= C_PAUSED_SYNC_T)
2598                 dp_flags |= DP_MAY_SET_IN_SYNC;
2599
2600         p.dp_flags = cpu_to_be32(dp_flags);
2601         set_bit(UNPLUG_REMOTE, &mdev->flags);
2602         ok = (sizeof(p) ==
2603                 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2604         if (ok && dgs) {
2605                 dgb = mdev->int_dig_out;
2606                 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2607                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2608         }
2609         if (ok) {
2610                 /* For protocol A, we have to memcpy the payload into
2611                  * socket buffers, as we may complete right away
2612                  * as soon as we handed it over to tcp, at which point the data
2613                  * pages may become invalid.
2614                  *
2615                  * For data-integrity enabled, we copy it as well, so we can be
2616                  * sure that even if the bio pages may still be modified, it
2617                  * won't change the data on the wire, thus if the digest checks
2618                  * out ok after sending on this side, but does not fit on the
2619                  * receiving side, we sure have detected corruption elsewhere.
2620                  */
2621                 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2622                         ok = _drbd_send_bio(mdev, req->master_bio);
2623                 else
2624                         ok = _drbd_send_zc_bio(mdev, req->master_bio);
2625
2626                 /* double check digest, sometimes buffers have been modified in flight. */
2627                 if (dgs > 0 && dgs <= 64) {
2628                         /* 64 byte, 512 bit, is the largest digest size
2629                          * currently supported in kernel crypto. */
2630                         unsigned char digest[64];
2631                         drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2632                         if (memcmp(mdev->int_dig_out, digest, dgs)) {
2633                                 dev_warn(DEV,
2634                                         "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2635                                         (unsigned long long)req->sector, req->size);
2636                         }
2637                 } /* else if (dgs > 64) {
2638                      ... Be noisy about digest too large ...
2639                 } */
2640         }
2641
2642         drbd_put_data_sock(mdev);
2643
2644         return ok;
2645 }
2646
2647 /* answer packet, used to send data back for read requests:
2648  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2649  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2650  */
2651 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2652                     struct drbd_epoch_entry *e)
2653 {
2654         int ok;
2655         struct p_data p;
2656         void *dgb;
2657         int dgs;
2658
2659         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2660                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2661
2662         if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2663                 p.head.h80.magic   = BE_DRBD_MAGIC;
2664                 p.head.h80.command = cpu_to_be16(cmd);
2665                 p.head.h80.length  =
2666                         cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2667         } else {
2668                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2669                 p.head.h95.command = cpu_to_be16(cmd);
2670                 p.head.h95.length  =
2671                         cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2672         }
2673
2674         p.sector   = cpu_to_be64(e->sector);
2675         p.block_id = e->block_id;
2676         /* p.seq_num  = 0;    No sequence numbers here.. */
2677
2678         /* Only called by our kernel thread.
2679          * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2680          * in response to admin command or module unload.
2681          */
2682         if (!drbd_get_data_sock(mdev))
2683                 return 0;
2684
2685         ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2686         if (ok && dgs) {
2687                 dgb = mdev->int_dig_out;
2688                 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2689                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2690         }
2691         if (ok)
2692                 ok = _drbd_send_zc_ee(mdev, e);
2693
2694         drbd_put_data_sock(mdev);
2695
2696         return ok;
2697 }
2698
2699 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2700 {
2701         struct p_block_desc p;
2702
2703         p.sector  = cpu_to_be64(req->sector);
2704         p.blksize = cpu_to_be32(req->size);
2705
2706         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2707 }
2708
2709 /*
2710   drbd_send distinguishes two cases:
2711
2712   Packets sent via the data socket "sock"
2713   and packets sent via the meta data socket "msock"
2714
2715                     sock                      msock
2716   -----------------+-------------------------+------------------------------
2717   timeout           conf.timeout / 2          conf.timeout / 2
2718   timeout action    send a ping via msock     Abort communication
2719                                               and close all sockets
2720 */
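/*
 * Example (editor's sketch): with conf.timeout = 60 (the value is in tenths
 * of a second, i.e. 6 seconds), each sendmsg on either socket times out
 * after roughly 3 seconds.  A timeout on "sock" merely triggers a ping via
 * "msock" (see we_should_drop_the_connection()), whereas a timeout on
 * "msock" is treated as a dead peer and all sockets are torn down.
 */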
2721
2722 /*
2723  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2724  */
2725 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2726               void *buf, size_t size, unsigned msg_flags)
2727 {
2728         struct kvec iov;
2729         struct msghdr msg;
2730         int rv, sent = 0;
2731
2732         if (!sock)
2733                 return -1000;
2734
2735         /* THINK  if (signal_pending) return ... ? */
2736
2737         iov.iov_base = buf;
2738         iov.iov_len  = size;
2739
2740         msg.msg_name       = NULL;
2741         msg.msg_namelen    = 0;
2742         msg.msg_control    = NULL;
2743         msg.msg_controllen = 0;
2744         msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2745
2746         if (sock == mdev->data.socket) {
2747                 mdev->ko_count = mdev->net_conf->ko_count;
2748                 drbd_update_congested(mdev);
2749         }
2750         do {
2751                 /* STRANGE
2752                  * tcp_sendmsg does _not_ use its size parameter at all ?
2753                  *
2754                  * -EAGAIN on timeout, -EINTR on signal.
2755                  */
2756 /* THINK
2757  * do we need to block DRBD_SIG if sock == &meta.socket ??
2758  * otherwise wake_asender() might interrupt some send_*Ack !
2759  */
2760                 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2761                 if (rv == -EAGAIN) {
2762                         if (we_should_drop_the_connection(mdev, sock))
2763                                 break;
2764                         else
2765                                 continue;
2766                 }
2767                 D_ASSERT(rv != 0);
2768                 if (rv == -EINTR) {
2769                         flush_signals(current);
2770                         rv = 0;
2771                 }
2772                 if (rv < 0)
2773                         break;
2774                 sent += rv;
2775                 iov.iov_base += rv;
2776                 iov.iov_len  -= rv;
2777         } while (sent < size);
2778
2779         if (sock == mdev->data.socket)
2780                 clear_bit(NET_CONGESTED, &mdev->flags);
2781
2782         if (rv <= 0) {
2783                 if (rv != -EAGAIN) {
2784                         dev_err(DEV, "%s_sendmsg returned %d\n",
2785                             sock == mdev->meta.socket ? "msock" : "sock",
2786                             rv);
2787                         drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2788                 } else
2789                         drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2790         }
2791
2792         return sent;
2793 }
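     /*
      * Illustrative sketch only (not used by the driver): the expected
      * calling convention for drbd_send() on the data socket.  The caller
      * holds the socket mutex for the whole transfer and treats a short
      * return value as failure, just like the senders above do.  The
      * helper name is hypothetical.
      */
     static int __maybe_unused example_send_blob(struct drbd_conf *mdev,
                                                 void *blob, size_t len)
     {
             int sent;

             mutex_lock(&mdev->data.mutex);
             sent = drbd_send(mdev, mdev->data.socket, blob, len, 0);
             mutex_unlock(&mdev->data.mutex);

             return sent == (int)len;
     }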
2794
2795 static int drbd_open(struct block_device *bdev, fmode_t mode)
2796 {
2797         struct drbd_conf *mdev = bdev->bd_disk->private_data;
2798         unsigned long flags;
2799         int rv = 0;
2800
2801         mutex_lock(&drbd_main_mutex);
2802         spin_lock_irqsave(&mdev->req_lock, flags);
2803         /* to have a stable mdev->state.role
2804          * and no race with updating open_cnt */
2805
2806         if (mdev->state.role != R_PRIMARY) {
2807                 if (mode & FMODE_WRITE)
2808                         rv = -EROFS;
2809                 else if (!allow_oos)
2810                         rv = -EMEDIUMTYPE;
2811         }
2812
2813         if (!rv)
2814                 mdev->open_cnt++;
2815         spin_unlock_irqrestore(&mdev->req_lock, flags);
2816         mutex_unlock(&drbd_main_mutex);
2817
2818         return rv;
2819 }
2820
2821 static int drbd_release(struct gendisk *gd, fmode_t mode)
2822 {
2823         struct drbd_conf *mdev = gd->private_data;
2824         mutex_lock(&drbd_main_mutex);
2825         mdev->open_cnt--;
2826         mutex_unlock(&drbd_main_mutex);
2827         return 0;
2828 }
2829
2830 static void drbd_set_defaults(struct drbd_conf *mdev)
2831 {
2832         /* This way we get a compile error when sync_conf grows
2833            and we forget to initialize it here */
2834         mdev->sync_conf = (struct syncer_conf) {
2835                 /* .rate = */           DRBD_RATE_DEF,
2836                 /* .after = */          DRBD_AFTER_DEF,
2837                 /* .al_extents = */     DRBD_AL_EXTENTS_DEF,
2838                 /* .verify_alg = */     {}, 0,
2839                 /* .cpu_mask = */       {}, 0,
2840                 /* .csums_alg = */      {}, 0,
2841                 /* .use_rle = */        0,
2842                 /* .on_no_data = */     DRBD_ON_NO_DATA_DEF,
2843                 /* .c_plan_ahead = */   DRBD_C_PLAN_AHEAD_DEF,
2844                 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2845                 /* .c_fill_target = */  DRBD_C_FILL_TARGET_DEF,
2846                 /* .c_max_rate = */     DRBD_C_MAX_RATE_DEF,
2847                 /* .c_min_rate = */     DRBD_C_MIN_RATE_DEF
2848         };
2849
2850         /* Have to do it this way, because the layout differs between
2851            big endian and little endian machines */
2852         mdev->state = (union drbd_state) {
2853                 { .role = R_SECONDARY,
2854                   .peer = R_UNKNOWN,
2855                   .conn = C_STANDALONE,
2856                   .disk = D_DISKLESS,
2857                   .pdsk = D_UNKNOWN,
2858                   .susp = 0,
2859                   .susp_nod = 0,
2860                   .susp_fen = 0
2861                 } };
2862 }
2863
2864 void drbd_init_set_defaults(struct drbd_conf *mdev)
2865 {
2866         /* the memset(,0,) did most of this.
2867          * note: only assignments, no allocation in here */
2868
2869         drbd_set_defaults(mdev);
2870
2871         atomic_set(&mdev->ap_bio_cnt, 0);
2872         atomic_set(&mdev->ap_pending_cnt, 0);
2873         atomic_set(&mdev->rs_pending_cnt, 0);
2874         atomic_set(&mdev->unacked_cnt, 0);
2875         atomic_set(&mdev->local_cnt, 0);
2876         atomic_set(&mdev->net_cnt, 0);
2877         atomic_set(&mdev->packet_seq, 0);
2878         atomic_set(&mdev->pp_in_use, 0);
2879         atomic_set(&mdev->pp_in_use_by_net, 0);
2880         atomic_set(&mdev->rs_sect_in, 0);
2881         atomic_set(&mdev->rs_sect_ev, 0);
2882         atomic_set(&mdev->ap_in_flight, 0);
2883
2884         mutex_init(&mdev->md_io_mutex);
2885         mutex_init(&mdev->data.mutex);
2886         mutex_init(&mdev->meta.mutex);
2887         sema_init(&mdev->data.work.s, 0);
2888         sema_init(&mdev->meta.work.s, 0);
2889         mutex_init(&mdev->state_mutex);
2890
2891         spin_lock_init(&mdev->data.work.q_lock);
2892         spin_lock_init(&mdev->meta.work.q_lock);
2893
2894         spin_lock_init(&mdev->al_lock);
2895         spin_lock_init(&mdev->req_lock);
2896         spin_lock_init(&mdev->peer_seq_lock);
2897         spin_lock_init(&mdev->epoch_lock);
2898
2899         INIT_LIST_HEAD(&mdev->active_ee);
2900         INIT_LIST_HEAD(&mdev->sync_ee);
2901         INIT_LIST_HEAD(&mdev->done_ee);
2902         INIT_LIST_HEAD(&mdev->read_ee);
2903         INIT_LIST_HEAD(&mdev->net_ee);
2904         INIT_LIST_HEAD(&mdev->resync_reads);
2905         INIT_LIST_HEAD(&mdev->data.work.q);
2906         INIT_LIST_HEAD(&mdev->meta.work.q);
2907         INIT_LIST_HEAD(&mdev->resync_work.list);
2908         INIT_LIST_HEAD(&mdev->unplug_work.list);
2909         INIT_LIST_HEAD(&mdev->go_diskless.list);
2910         INIT_LIST_HEAD(&mdev->md_sync_work.list);
2911         INIT_LIST_HEAD(&mdev->start_resync_work.list);
2912         INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2913
2914         mdev->resync_work.cb  = w_resync_inactive;
2915         mdev->unplug_work.cb  = w_send_write_hint;
2916         mdev->go_diskless.cb  = w_go_diskless;
2917         mdev->md_sync_work.cb = w_md_sync;
2918         mdev->bm_io_work.w.cb = w_bitmap_io;
2919         init_timer(&mdev->resync_timer);
2920         init_timer(&mdev->md_sync_timer);
2921         mdev->resync_timer.function = resync_timer_fn;
2922         mdev->resync_timer.data = (unsigned long) mdev;
2923         mdev->md_sync_timer.function = md_sync_timer_fn;
2924         mdev->md_sync_timer.data = (unsigned long) mdev;
2925
2926         init_waitqueue_head(&mdev->misc_wait);
2927         init_waitqueue_head(&mdev->state_wait);
2928         init_waitqueue_head(&mdev->net_cnt_wait);
2929         init_waitqueue_head(&mdev->ee_wait);
2930         init_waitqueue_head(&mdev->al_wait);
2931         init_waitqueue_head(&mdev->seq_wait);
2932
2933         drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2934         drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2935         drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2936
2937         mdev->agreed_pro_version = PRO_VERSION_MAX;
2938         mdev->write_ordering = WO_bdev_flush;
2939         mdev->resync_wenr = LC_FREE;
2940 }
2941
2942 void drbd_mdev_cleanup(struct drbd_conf *mdev)
2943 {
2944         int i;
2945         if (mdev->receiver.t_state != None)
2946                 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2947                                 mdev->receiver.t_state);
2948
2949         /* no need to lock it, I'm the only thread alive */
2950         if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
2951                 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2952         mdev->al_writ_cnt  =
2953         mdev->bm_writ_cnt  =
2954         mdev->read_cnt     =
2955         mdev->recv_cnt     =
2956         mdev->send_cnt     =
2957         mdev->writ_cnt     =
2958         mdev->p_size       =
2959         mdev->rs_start     =
2960         mdev->rs_total     =
2961         mdev->rs_failed    = 0;
2962         mdev->rs_last_events = 0;
2963         mdev->rs_last_sect_ev = 0;
2964         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2965                 mdev->rs_mark_left[i] = 0;
2966                 mdev->rs_mark_time[i] = 0;
2967         }
2968         D_ASSERT(mdev->net_conf == NULL);
2969
2970         drbd_set_my_capacity(mdev, 0);
2971         if (mdev->bitmap) {
2972                 /* maybe never allocated. */
2973                 drbd_bm_resize(mdev, 0, 1);
2974                 drbd_bm_cleanup(mdev);
2975         }
2976
2977         drbd_free_resources(mdev);
2978         clear_bit(AL_SUSPENDED, &mdev->flags);
2979
2980         /*
2981          * currently we call drbd_init_ee only on module load, so
2982          * we may call drbd_release_ee only on module unload!
2983          */
2984         D_ASSERT(list_empty(&mdev->active_ee));
2985         D_ASSERT(list_empty(&mdev->sync_ee));
2986         D_ASSERT(list_empty(&mdev->done_ee));
2987         D_ASSERT(list_empty(&mdev->read_ee));
2988         D_ASSERT(list_empty(&mdev->net_ee));
2989         D_ASSERT(list_empty(&mdev->resync_reads));
2990         D_ASSERT(list_empty(&mdev->data.work.q));
2991         D_ASSERT(list_empty(&mdev->meta.work.q));
2992         D_ASSERT(list_empty(&mdev->resync_work.list));
2993         D_ASSERT(list_empty(&mdev->unplug_work.list));
2994         D_ASSERT(list_empty(&mdev->go_diskless.list));
2995 }
2996
2997
2998 static void drbd_destroy_mempools(void)
2999 {
3000         struct page *page;
3001
3002         while (drbd_pp_pool) {
3003                 page = drbd_pp_pool;
3004                 drbd_pp_pool = (struct page *)page_private(page);
3005                 __free_page(page);
3006                 drbd_pp_vacant--;
3007         }
3008
3009         /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3010
3011         if (drbd_ee_mempool)
3012                 mempool_destroy(drbd_ee_mempool);
3013         if (drbd_request_mempool)
3014                 mempool_destroy(drbd_request_mempool);
3015         if (drbd_ee_cache)
3016                 kmem_cache_destroy(drbd_ee_cache);
3017         if (drbd_request_cache)
3018                 kmem_cache_destroy(drbd_request_cache);
3019         if (drbd_bm_ext_cache)
3020                 kmem_cache_destroy(drbd_bm_ext_cache);
3021         if (drbd_al_ext_cache)
3022                 kmem_cache_destroy(drbd_al_ext_cache);
3023
3024         drbd_ee_mempool      = NULL;
3025         drbd_request_mempool = NULL;
3026         drbd_ee_cache        = NULL;
3027         drbd_request_cache   = NULL;
3028         drbd_bm_ext_cache    = NULL;
3029         drbd_al_ext_cache    = NULL;
3030
3031         return;
3032 }
3033
3034 static int drbd_create_mempools(void)
3035 {
3036         struct page *page;
3037         const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3038         int i;
3039
3040         /* prepare our caches and mempools */
3041         drbd_request_mempool = NULL;
3042         drbd_ee_cache        = NULL;
3043         drbd_request_cache   = NULL;
3044         drbd_bm_ext_cache    = NULL;
3045         drbd_al_ext_cache    = NULL;
3046         drbd_pp_pool         = NULL;
3047
3048         /* caches */
3049         drbd_request_cache = kmem_cache_create(
3050                 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3051         if (drbd_request_cache == NULL)
3052                 goto Enomem;
3053
3054         drbd_ee_cache = kmem_cache_create(
3055                 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3056         if (drbd_ee_cache == NULL)
3057                 goto Enomem;
3058
3059         drbd_bm_ext_cache = kmem_cache_create(
3060                 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3061         if (drbd_bm_ext_cache == NULL)
3062                 goto Enomem;
3063
3064         drbd_al_ext_cache = kmem_cache_create(
3065                 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3066         if (drbd_al_ext_cache == NULL)
3067                 goto Enomem;
3068
3069         /* mempools */
3070         drbd_request_mempool = mempool_create(number,
3071                 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3072         if (drbd_request_mempool == NULL)
3073                 goto Enomem;
3074
3075         drbd_ee_mempool = mempool_create(number,
3076                 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3077         if (drbd_ee_mempool == NULL)
3078                 goto Enomem;
3079
3080         /* drbd's page pool */
3081         spin_lock_init(&drbd_pp_lock);
3082
3083         for (i = 0; i < number; i++) {
3084                 page = alloc_page(GFP_HIGHUSER);
3085                 if (!page)
3086                         goto Enomem;
3087                 set_page_private(page, (unsigned long)drbd_pp_pool);
3088                 drbd_pp_pool = page;
3089         }
3090         drbd_pp_vacant = number;
3091
3092         return 0;
3093
3094 Enomem:
3095         drbd_destroy_mempools(); /* in case we allocated some */
3096         return -ENOMEM;
3097 }
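     /*
      * Illustrative sketch only: the drbd_pp_pool built above is a singly
      * linked list of pages threaded through page_private().  Popping one
      * page from it (ignoring drbd_pp_lock and the vacant counter for
      * brevity) would look roughly like this; the helper name is
      * hypothetical.
      */
     static __maybe_unused struct page *example_pp_pop(void)
     {
             struct page *page = drbd_pp_pool;

             if (page) {
                     drbd_pp_pool = (struct page *)page_private(page);
                     set_page_private(page, 0);
             }
             return page;
     }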
3098
3099 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3100         void *unused)
3101 {
3102         /* just so we have it.  you never know what interesting things we
3103          * might want to do here some day...
3104          */
3105
3106         return NOTIFY_DONE;
3107 }
3108
3109 static struct notifier_block drbd_notifier = {
3110         .notifier_call = drbd_notify_sys,
3111 };
3112
3113 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3114 {
3115         int rr;
3116
3117         rr = drbd_release_ee(mdev, &mdev->active_ee);
3118         if (rr)
3119                 dev_err(DEV, "%d EEs in active list found!\n", rr);
3120
3121         rr = drbd_release_ee(mdev, &mdev->sync_ee);
3122         if (rr)
3123                 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3124
3125         rr = drbd_release_ee(mdev, &mdev->read_ee);
3126         if (rr)
3127                 dev_err(DEV, "%d EEs in read list found!\n", rr);
3128
3129         rr = drbd_release_ee(mdev, &mdev->done_ee);
3130         if (rr)
3131                 dev_err(DEV, "%d EEs in done list found!\n", rr);
3132
3133         rr = drbd_release_ee(mdev, &mdev->net_ee);
3134         if (rr)
3135                 dev_err(DEV, "%d EEs in net list found!\n", rr);
3136 }
3137
3138 /* caution. no locking.
3139  * currently only used from module cleanup code. */
3140 static void drbd_delete_device(unsigned int minor)
3141 {
3142         struct drbd_conf *mdev = minor_to_mdev(minor);
3143
3144         if (!mdev)
3145                 return;
3146
3147         /* paranoia asserts */
3148         if (mdev->open_cnt != 0)
3149                 dev_err(DEV, "open_cnt = %d in %s:%u\n", mdev->open_cnt,
3150                                 __FILE__, __LINE__);
3151
3152         ERR_IF (!list_empty(&mdev->data.work.q)) {
3153                 struct list_head *lp;
3154                 list_for_each(lp, &mdev->data.work.q) {
3155                         dev_err(DEV, "lp = %p\n", lp);
3156                 }
3157         };
3158         /* end paranoia asserts */
3159
3160         del_gendisk(mdev->vdisk);
3161
3162         /* cleanup stuff that may have been allocated during
3163          * device (re-)configuration or state changes */
3164
3165         if (mdev->this_bdev)
3166                 bdput(mdev->this_bdev);
3167
3168         drbd_free_resources(mdev);
3169
3170         drbd_release_ee_lists(mdev);
3171
3172         /* should be freed on disconnect? */
3173         kfree(mdev->ee_hash);
3174         /*
3175         mdev->ee_hash_s = 0;
3176         mdev->ee_hash = NULL;
3177         */
3178
3179         lc_destroy(mdev->act_log);
3180         lc_destroy(mdev->resync);
3181
3182         kfree(mdev->p_uuid);
3183         /* mdev->p_uuid = NULL; */
3184
3185         kfree(mdev->int_dig_out);
3186         kfree(mdev->int_dig_in);
3187         kfree(mdev->int_dig_vv);
3188
3189         /* cleanup the rest that has been
3190          * allocated from drbd_new_device
3191          * and actually free the mdev itself */
3192         drbd_free_mdev(mdev);
3193 }
3194
3195 static void drbd_cleanup(void)
3196 {
3197         unsigned int i;
3198
3199         unregister_reboot_notifier(&drbd_notifier);
3200
3201         /* first remove proc,
3202          * drbdsetup uses its presence to detect
3203          * whether DRBD is loaded.
3204          * If we got stuck in proc removal
3205          * with netlink already deregistered,
3206          * some drbdsetup commands may wait forever
3207          * for an answer.
3208          */
3209         if (drbd_proc)
3210                 remove_proc_entry("drbd", NULL);
3211
3212         drbd_nl_cleanup();
3213
3214         if (minor_table) {
3215                 i = minor_count;
3216                 while (i--)
3217                         drbd_delete_device(i);
3218                 drbd_destroy_mempools();
3219         }
3220
3221         kfree(minor_table);
3222
3223         unregister_blkdev(DRBD_MAJOR, "drbd");
3224
3225         printk(KERN_INFO "drbd: module cleanup done.\n");
3226 }
3227
3228 /**
3229  * drbd_congested() - Callback for pdflush
3230  * @congested_data:     User data
3231  * @bdi_bits:           Bits pdflush is currently interested in
3232  *
3233  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3234  */
3235 static int drbd_congested(void *congested_data, int bdi_bits)
3236 {
3237         struct drbd_conf *mdev = congested_data;
3238         struct request_queue *q;
3239         char reason = '-';
3240         int r = 0;
3241
3242         if (!__inc_ap_bio_cond(mdev)) {
3243                 /* DRBD has frozen IO */
3244                 r = bdi_bits;
3245                 reason = 'd';
3246                 goto out;
3247         }
3248
3249         if (get_ldev(mdev)) {
3250                 q = bdev_get_queue(mdev->ldev->backing_bdev);
3251                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3252                 put_ldev(mdev);
3253                 if (r)
3254                         reason = 'b';
3255         }
3256
3257         if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3258                 r |= (1 << BDI_async_congested);
3259                 reason = reason == 'b' ? 'a' : 'n';
3260         }
3261
3262 out:
3263         mdev->congestion_reason = reason;
3264         return r;
3265 }
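     /*
      * Illustrative sketch only: how the callback above might be queried
      * for write-back (async) congestion.  The congested_fn/congested_data
      * wiring itself happens in drbd_new_device() below; the helper name
      * here is hypothetical.
      */
     static int __maybe_unused example_write_congested(struct drbd_conf *mdev)
     {
             return drbd_congested(mdev, 1 << BDI_async_congested);
     }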
3266
3267 struct drbd_conf *drbd_new_device(unsigned int minor)
3268 {
3269         struct drbd_conf *mdev;
3270         struct gendisk *disk;
3271         struct request_queue *q;
3272
3273         /* GFP_KERNEL, we are outside of all write-out paths */
3274         mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3275         if (!mdev)
3276                 return NULL;
3277         if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3278                 goto out_no_cpumask;
3279
3280         mdev->minor = minor;
3281
3282         drbd_init_set_defaults(mdev);
3283
3284         q = blk_alloc_queue(GFP_KERNEL);
3285         if (!q)
3286                 goto out_no_q;
3287         mdev->rq_queue = q;
3288         q->queuedata   = mdev;
3289
3290         disk = alloc_disk(1);
3291         if (!disk)
3292                 goto out_no_disk;
3293         mdev->vdisk = disk;
3294
3295         set_disk_ro(disk, true);
3296
3297         disk->queue = q;
3298         disk->major = DRBD_MAJOR;
3299         disk->first_minor = minor;
3300         disk->fops = &drbd_ops;
3301         sprintf(disk->disk_name, "drbd%d", minor);
3302         disk->private_data = mdev;
3303
3304         mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3305         /* we have no partitions. we contain only ourselves. */
3306         mdev->this_bdev->bd_contains = mdev->this_bdev;
3307
3308         q->backing_dev_info.congested_fn = drbd_congested;
3309         q->backing_dev_info.congested_data = mdev;
3310
3311         blk_queue_make_request(q, drbd_make_request);
3312         blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
3313         blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3314         blk_queue_merge_bvec(q, drbd_merge_bvec);
3315         q->queue_lock = &mdev->req_lock;
3316
3317         mdev->md_io_page = alloc_page(GFP_KERNEL);
3318         if (!mdev->md_io_page)
3319                 goto out_no_io_page;
3320
3321         if (drbd_bm_init(mdev))
3322                 goto out_no_bitmap;
3323         /* no need to lock access, we are still initializing this minor device. */
3324         if (!tl_init(mdev))
3325                 goto out_no_tl;
3326
3327         mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3328         if (!mdev->app_reads_hash)
3329                 goto out_no_app_reads;
3330
3331         mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3332         if (!mdev->current_epoch)
3333                 goto out_no_epoch;
3334
3335         INIT_LIST_HEAD(&mdev->current_epoch->list);
3336         mdev->epochs = 1;
3337
3338         return mdev;
3339
3340 /* out_whatever_else:
3341         kfree(mdev->current_epoch); */
3342 out_no_epoch:
3343         kfree(mdev->app_reads_hash);
3344 out_no_app_reads:
3345         tl_cleanup(mdev);
3346 out_no_tl:
3347         drbd_bm_cleanup(mdev);
3348 out_no_bitmap:
3349         __free_page(mdev->md_io_page);
3350 out_no_io_page:
3351         put_disk(disk);
3352 out_no_disk:
3353         blk_cleanup_queue(q);
3354 out_no_q:
3355         free_cpumask_var(mdev->cpu_mask);
3356 out_no_cpumask:
3357         kfree(mdev);
3358         return NULL;
3359 }
3360
3361 /* counterpart of drbd_new_device.
3362  * last part of drbd_delete_device. */
3363 void drbd_free_mdev(struct drbd_conf *mdev)
3364 {
3365         kfree(mdev->current_epoch);
3366         kfree(mdev->app_reads_hash);
3367         tl_cleanup(mdev);
3368         if (mdev->bitmap) /* should no longer be there. */
3369                 drbd_bm_cleanup(mdev);
3370         __free_page(mdev->md_io_page);
3371         put_disk(mdev->vdisk);
3372         blk_cleanup_queue(mdev->rq_queue);
3373         free_cpumask_var(mdev->cpu_mask);
3374         drbd_free_tl_hash(mdev);
3375         kfree(mdev);
3376 }
3377
3378
3379 int __init drbd_init(void)
3380 {
3381         int err;
3382
3383         if (sizeof(struct p_handshake) != 80) {
3384                 printk(KERN_ERR
3385                        "drbd: never change the size or layout "
3386                        "of the HandShake packet.\n");
3387                 return -EINVAL;
3388         }
3389
3390         if (1 > minor_count || minor_count > 255) {
3391                 printk(KERN_ERR
3392                         "drbd: invalid minor_count (%d)\n", minor_count);
3393 #ifdef MODULE
3394                 return -EINVAL;
3395 #else
3396                 minor_count = 8;
3397 #endif
3398         }
3399
3400         err = drbd_nl_init();
3401         if (err)
3402                 return err;
3403
3404         err = register_blkdev(DRBD_MAJOR, "drbd");
3405         if (err) {
3406                 printk(KERN_ERR
3407                        "drbd: unable to register block device major %d\n",
3408                        DRBD_MAJOR);
3409                 return err;
3410         }
3411
3412         register_reboot_notifier(&drbd_notifier);
3413
3414         /*
3415          * allocate all necessary structs
3416          */
3417         err = -ENOMEM;
3418
3419         init_waitqueue_head(&drbd_pp_wait);
3420
3421         drbd_proc = NULL; /* play safe for drbd_cleanup */
3422         minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3423                                 GFP_KERNEL);
3424         if (!minor_table)
3425                 goto Enomem;
3426
3427         err = drbd_create_mempools();
3428         if (err)
3429                 goto Enomem;
3430
3431         drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO, NULL, &drbd_proc_fops, NULL);
3432         if (!drbd_proc) {
3433                 printk(KERN_ERR "drbd: unable to register proc file\n");
3434                 goto Enomem;
3435         }
3436
3437         rwlock_init(&global_state_lock);
3438
3439         printk(KERN_INFO "drbd: initialized. "
3440                "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3441                API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3442         printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3443         printk(KERN_INFO "drbd: registered as block device major %d\n",
3444                 DRBD_MAJOR);
3445         printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3446
3447         return 0; /* Success! */
3448
3449 Enomem:
3450         drbd_cleanup();
3451         if (err == -ENOMEM)
3452                 /* currently always the case */
3453                 printk(KERN_ERR "drbd: ran out of memory\n");
3454         else
3455                 printk(KERN_ERR "drbd: initialization failure\n");
3456         return err;
3457 }
3458
3459 void drbd_free_bc(struct drbd_backing_dev *ldev)
3460 {
3461         if (ldev == NULL)
3462                 return;
3463
3464         blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3465         blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3466
3467         kfree(ldev);
3468 }
3469
3470 void drbd_free_sock(struct drbd_conf *mdev)
3471 {
3472         if (mdev->data.socket) {
3473                 mutex_lock(&mdev->data.mutex);
3474                 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3475                 sock_release(mdev->data.socket);
3476                 mdev->data.socket = NULL;
3477                 mutex_unlock(&mdev->data.mutex);
3478         }
3479         if (mdev->meta.socket) {
3480                 mutex_lock(&mdev->meta.mutex);
3481                 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3482                 sock_release(mdev->meta.socket);
3483                 mdev->meta.socket = NULL;
3484                 mutex_unlock(&mdev->meta.mutex);
3485         }
3486 }
3487
3488
3489 void drbd_free_resources(struct drbd_conf *mdev)
3490 {
3491         crypto_free_hash(mdev->csums_tfm);
3492         mdev->csums_tfm = NULL;
3493         crypto_free_hash(mdev->verify_tfm);
3494         mdev->verify_tfm = NULL;
3495         crypto_free_hash(mdev->cram_hmac_tfm);
3496         mdev->cram_hmac_tfm = NULL;
3497         crypto_free_hash(mdev->integrity_w_tfm);
3498         mdev->integrity_w_tfm = NULL;
3499         crypto_free_hash(mdev->integrity_r_tfm);
3500         mdev->integrity_r_tfm = NULL;
3501
3502         drbd_free_sock(mdev);
3503
3504         __no_warn(local,
3505                   drbd_free_bc(mdev->ldev);
3506                   mdev->ldev = NULL;);
3507 }
3508
3509 /* meta data management */
3510
3511 struct meta_data_on_disk {
3512         u64 la_size;           /* last agreed size. */
3513         u64 uuid[UI_SIZE];   /* UUIDs. */
3514         u64 device_uuid;
3515         u64 reserved_u64_1;
3516         u32 flags;             /* MDF */
3517         u32 magic;
3518         u32 md_size_sect;
3519         u32 al_offset;         /* offset to this block */
3520         u32 al_nr_extents;     /* important for restoring the AL */
3521               /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3522         u32 bm_offset;         /* offset to the bitmap, from here */
3523         u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3524         u32 reserved_u32[4];
3525
3526 } __packed;
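     /*
      * Illustrative sketch (an assumption, not an upstream check): the
      * on-disk layout above has to fit into the single 512 byte sector
      * that drbd_md_sync() below zeroes and writes out.  A compile time
      * guard for that invariant could look like this:
      */
     static inline void example_meta_data_layout_check(void)
     {
             BUILD_BUG_ON(sizeof(struct meta_data_on_disk) > 512);
     }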
3527
3528 /**
3529  * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3530  * @mdev:       DRBD device.
3531  */
3532 void drbd_md_sync(struct drbd_conf *mdev)
3533 {
3534         struct meta_data_on_disk *buffer;
3535         sector_t sector;
3536         int i;
3537
3538         del_timer(&mdev->md_sync_timer);
3539         /* timer may be rearmed by drbd_md_mark_dirty() now. */
3540         if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3541                 return;
3542
3543         /* We use D_FAILED here and not D_ATTACHING because we try to write
3544          * metadata even if we detach due to a disk failure! */
3545         if (!get_ldev_if_state(mdev, D_FAILED))
3546                 return;
3547
3548         mutex_lock(&mdev->md_io_mutex);
3549         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3550         memset(buffer, 0, 512);
3551
3552         buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3553         for (i = UI_CURRENT; i < UI_SIZE; i++)
3554                 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3555         buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3556         buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3557
3558         buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3559         buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3560         buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3561         buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3562         buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3563
3564         buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3565
3566         D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3567         sector = mdev->ldev->md.md_offset;
3568
3569         if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3570                 /* this was a best-effort attempt anyway ... */
3571                 dev_err(DEV, "meta data update failed!\n");
3572                 drbd_chk_io_error(mdev, 1, true);
3573         }
3574
3575         /* Update mdev->ldev->md.la_size_sect,
3576          * since we just wrote the updated value to the on-disk metadata. */
3577         mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3578
3579         mutex_unlock(&mdev->md_io_mutex);
3580         put_ldev(mdev);
3581 }
3582
3583 /**
3584  * drbd_md_read() - Reads in the meta data super block
3585  * @mdev:       DRBD device.
3586  * @bdev:       Device from which the meta data should be read in.
3587  *
3588  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3589  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3590  */
3591 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3592 {
3593         struct meta_data_on_disk *buffer;
3594         int i, rv = NO_ERROR;
3595
3596         if (!get_ldev_if_state(mdev, D_ATTACHING))
3597                 return ERR_IO_MD_DISK;
3598
3599         mutex_lock(&mdev->md_io_mutex);
3600         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3601
3602         if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3603                 /* NOTE: can't do normal error processing here, as this is
3604                    called BEFORE the disk is attached */
3605                 dev_err(DEV, "Error while reading metadata.\n");
3606                 rv = ERR_IO_MD_DISK;
3607                 goto err;
3608         }
3609
3610         if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3611                 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3612                 rv = ERR_MD_INVALID;
3613                 goto err;
3614         }
3615         if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3616                 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3617                     be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3618                 rv = ERR_MD_INVALID;
3619                 goto err;
3620         }
3621         if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3622                 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3623                     be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3624                 rv = ERR_MD_INVALID;
3625                 goto err;
3626         }
3627         if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3628                 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3629                     be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3630                 rv = ERR_MD_INVALID;
3631                 goto err;
3632         }
3633
3634         if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3635                 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3636                     be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3637                 rv = ERR_MD_INVALID;
3638                 goto err;
3639         }
3640
3641         bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3642         for (i = UI_CURRENT; i < UI_SIZE; i++)
3643                 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3644         bdev->md.flags = be32_to_cpu(buffer->flags);
3645         mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3646         bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3647
3648         if (mdev->sync_conf.al_extents < 7)
3649                 mdev->sync_conf.al_extents = 127;
3650
3651  err:
3652         mutex_unlock(&mdev->md_io_mutex);
3653         put_ldev(mdev);
3654
3655         return rv;
3656 }
3657
3658 static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3659 {
3660         static char *uuid_str[UI_EXTENDED_SIZE] = {
3661                 [UI_CURRENT] = "CURRENT",
3662                 [UI_BITMAP] = "BITMAP",
3663                 [UI_HISTORY_START] = "HISTORY_START",
3664                 [UI_HISTORY_END] = "HISTORY_END",
3665                 [UI_SIZE] = "SIZE",
3666                 [UI_FLAGS] = "FLAGS",
3667         };
3668
3669         if (index >= UI_EXTENDED_SIZE) {
3670                 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3671                 return;
3672         }
3673
3674         dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3675                  uuid_str[index],
3676                  (unsigned long long)mdev->ldev->md.uuid[index]);
3677 }
3678
3679
3680 /**
3681  * drbd_md_mark_dirty() - Mark meta data super block as dirty
3682  * @mdev:       DRBD device.
3683  *
3684  * Call this function if you change anything that should be written to
3685  * the meta-data super block. This function sets MD_DIRTY and arms a
3686  * timer that ensures drbd_md_sync() gets called within five seconds.
3687  */
3688 #ifdef DEBUG
3689 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3690 {
3691         if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3692                 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3693                 mdev->last_md_mark_dirty.line = line;
3694                 mdev->last_md_mark_dirty.func = func;
3695         }
3696 }
3697 #else
3698 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3699 {
3700         if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3701                 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3702 }
3703 #endif
3704
3705 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3706 {
3707         int i;
3708
3709         for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
3710                 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3711                 debug_drbd_uuid(mdev, i+1);
3712         }
3713 }
3714
3715 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3716 {
3717         if (idx == UI_CURRENT) {
3718                 if (mdev->state.role == R_PRIMARY)
3719                         val |= 1;
3720                 else
3721                         val &= ~((u64)1);
3722
3723                 drbd_set_ed_uuid(mdev, val);
3724         }
3725
3726         mdev->ldev->md.uuid[idx] = val;
3727         debug_drbd_uuid(mdev, idx);
3728         drbd_md_mark_dirty(mdev);
3729 }
3730
3731
3732 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3733 {
3734         if (mdev->ldev->md.uuid[idx]) {
3735                 drbd_uuid_move_history(mdev);
3736                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3737                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3738         }
3739         _drbd_uuid_set(mdev, idx, val);
3740 }
3741
3742 /**
3743  * drbd_uuid_new_current() - Creates a new current UUID
3744  * @mdev:       DRBD device.
3745  *
3746  * Creates a new current UUID, and rotates the old current UUID into
3747  * the bitmap slot. Causes an incremental resync upon next connect.
3748  */
3749 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3750 {
3751         u64 val;
3752
3753         dev_info(DEV, "Creating new current UUID\n");
3754         D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3755         mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3756         debug_drbd_uuid(mdev, UI_BITMAP);
3757
3758         get_random_bytes(&val, sizeof(u64));
3759         _drbd_uuid_set(mdev, UI_CURRENT, val);
3760         /* get it to stable storage _now_ */
3761         drbd_md_sync(mdev);
3762 }
3763
3764 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3765 {
3766         if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3767                 return;
3768
3769         if (val == 0) {
3770                 drbd_uuid_move_history(mdev);
3771                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3772                 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3773                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3774                 debug_drbd_uuid(mdev, UI_BITMAP);
3775         } else {
3776                 if (mdev->ldev->md.uuid[UI_BITMAP])
3777                         dev_warn(DEV, "bm UUID already set\n");
3778
3779                 mdev->ldev->md.uuid[UI_BITMAP] = val;
3780                 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3781
3782                 debug_drbd_uuid(mdev, UI_BITMAP);
3783         }
3784         drbd_md_mark_dirty(mdev);
3785 }
3786
3787 /**
3788  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3789  * @mdev:       DRBD device.
3790  *
3791  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3792  */
3793 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3794 {
3795         int rv = -EIO;
3796
3797         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3798                 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3799                 drbd_md_sync(mdev);
3800                 drbd_bm_set_all(mdev);
3801
3802                 rv = drbd_bm_write(mdev);
3803
3804                 if (!rv) {
3805                         drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3806                         drbd_md_sync(mdev);
3807                 }
3808
3809                 put_ldev(mdev);
3810         }
3811
3812         return rv;
3813 }
3814
3815 /**
3816  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3817  * @mdev:       DRBD device.
3818  *
3819  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3820  */
3821 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3822 {
3823         int rv = -EIO;
3824
3825         drbd_resume_al(mdev);
3826         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3827                 drbd_bm_clear_all(mdev);
3828                 rv = drbd_bm_write(mdev);
3829                 put_ldev(mdev);
3830         }
3831
3832         return rv;
3833 }
3834
3835 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3836 {
3837         struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3838         int rv;
3839
3840         D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3841
3842         drbd_bm_lock(mdev, work->why);
3843         rv = work->io_fn(mdev);
3844         drbd_bm_unlock(mdev);
3845
3846         clear_bit(BITMAP_IO, &mdev->flags);
3847         smp_mb__after_clear_bit();
3848         wake_up(&mdev->misc_wait);
3849
3850         if (work->done)
3851                 work->done(mdev, rv);
3852
3853         clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3854         work->why = NULL;
3855
3856         return 1;
3857 }
3858
3859 void drbd_ldev_destroy(struct drbd_conf *mdev)
3860 {
3861         lc_destroy(mdev->resync);
3862         mdev->resync = NULL;
3863         lc_destroy(mdev->act_log);
3864         mdev->act_log = NULL;
3865         __no_warn(local,
3866                 drbd_free_bc(mdev->ldev);
3867                 mdev->ldev = NULL;);
3868
3869         if (mdev->md_io_tmpp) {
3870                 __free_page(mdev->md_io_tmpp);
3871                 mdev->md_io_tmpp = NULL;
3872         }
3873         clear_bit(GO_DISKLESS, &mdev->flags);
3874 }
3875
3876 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3877 {
3878         D_ASSERT(mdev->state.disk == D_FAILED);
3879         /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3880          * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3881          * the protected members anymore, though, so once local_cnt reaches zero
3882          * again, it will be safe to free them. */
3883         drbd_force_state(mdev, NS(disk, D_DISKLESS));
3884         return 1;
3885 }
3886
3887 void drbd_go_diskless(struct drbd_conf *mdev)
3888 {
3889         D_ASSERT(mdev->state.disk == D_FAILED);
3890         if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3891                 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3892 }
3893
3894 /**
3895  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3896  * @mdev:       DRBD device.
3897  * @io_fn:      IO callback to be called when bitmap IO is possible
3898  * @done:       callback to be called after the bitmap IO was performed
3899  * @why:        Descriptive text of the reason for doing the IO
3900  *
3901  * While IO on the bitmap is in progress we freeze application IO and thus
3902  * ensure that drbd_set_out_of_sync() cannot be called. This function MAY ONLY
3903  * be called from worker context. It MUST NOT be used while a previous such
3904  * work is still pending!
3905  */
3906 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3907                           int (*io_fn)(struct drbd_conf *),
3908                           void (*done)(struct drbd_conf *, int),
3909                           char *why)
3910 {
3911         D_ASSERT(current == mdev->worker.task);
3912
3913         D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3914         D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3915         D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3916         if (mdev->bm_io_work.why)
3917                 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3918                         why, mdev->bm_io_work.why);
3919
3920         mdev->bm_io_work.io_fn = io_fn;
3921         mdev->bm_io_work.done = done;
3922         mdev->bm_io_work.why = why;
3923
3924         spin_lock_irq(&mdev->req_lock);
3925         set_bit(BITMAP_IO, &mdev->flags);
3926         if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3927                 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
3928                         drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3929         }
3930         spin_unlock_irq(&mdev->req_lock);
3931 }
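     /*
      * Illustrative sketch only: queueing a full "set all bits and write
      * the bitmap out" operation from worker context, reusing
      * drbd_bmio_set_n_write() defined above.  The callback and wrapper
      * names are hypothetical.
      */
     static void __maybe_unused example_set_n_write_done(struct drbd_conf *mdev, int rv)
     {
             if (rv)
                     dev_err(DEV, "full bitmap write failed: %d\n", rv);
     }

     static void __maybe_unused example_queue_full_bitmap_write(struct drbd_conf *mdev)
     {
             /* caller must be the worker, with no other bitmap IO pending */
             drbd_queue_bitmap_io(mdev, drbd_bmio_set_n_write,
                                  example_set_n_write_done, "example: set_n_write");
     }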
3932
3933 /**
3934  * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
3935  * @mdev:       DRBD device.
3936  * @io_fn:      IO callback to be called when bitmap IO is possible
3937  * @why:        Descriptive text of the reason for doing the IO
3938  *
3939  * Freezes application IO while the actual IO operation runs. This
3940  * function MAY NOT be called from worker context.
3941  */
3942 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3943 {
3944         int rv;
3945
3946         D_ASSERT(current != mdev->worker.task);
3947
3948         drbd_suspend_io(mdev);
3949
3950         drbd_bm_lock(mdev, why);
3951         rv = io_fn(mdev);
3952         drbd_bm_unlock(mdev);
3953
3954         drbd_resume_io(mdev);
3955
3956         return rv;
3957 }
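     /*
      * Illustrative counterpart to the sketch after drbd_queue_bitmap_io()
      * above: from any context other than the worker, the synchronous
      * variant is used instead.  Hypothetical helper that clears the
      * bitmap and writes it out via drbd_bmio_clear_n_write().
      */
     static int __maybe_unused example_clear_bitmap_sync(struct drbd_conf *mdev)
     {
             return drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
                                   "example: clear_n_write");
     }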
3958
3959 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3960 {
3961         if ((mdev->ldev->md.flags & flag) != flag) {
3962                 drbd_md_mark_dirty(mdev);
3963                 mdev->ldev->md.flags |= flag;
3964         }
3965 }
3966
3967 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3968 {
3969         if ((mdev->ldev->md.flags & flag) != 0) {
3970                 drbd_md_mark_dirty(mdev);
3971                 mdev->ldev->md.flags &= ~flag;
3972         }
3973 }
3974 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3975 {
3976         return (bdev->md.flags & flag) != 0;
3977 }
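     /*
      * Illustrative sketch only: a typical query of a persistent meta data
      * flag, here checking whether a full sync is still pending (compare
      * drbd_bmio_set_n_write() above).  The helper name is hypothetical.
      */
     static int __maybe_unused example_full_sync_pending(struct drbd_conf *mdev)
     {
             /* caller is expected to hold a local reference (get_ldev) */
             return drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC);
     }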
3978
3979 static void md_sync_timer_fn(unsigned long data)
3980 {
3981         struct drbd_conf *mdev = (struct drbd_conf *) data;
3982
3983         drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3984 }
3985
3986 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3987 {
3988         dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3989 #ifdef DEBUG
3990         dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3991                 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3992 #endif
3993         drbd_md_sync(mdev);
3994         return 1;
3995 }
3996
3997 #ifdef CONFIG_DRBD_FAULT_INJECTION
3998 /* Fault insertion support including random number generator shamelessly
3999  * stolen from kernel/rcutorture.c */
4000 struct fault_random_state {
4001         unsigned long state;
4002         unsigned long count;
4003 };
4004
4005 #define FAULT_RANDOM_MULT 39916801  /* prime */
4006 #define FAULT_RANDOM_ADD        479001701 /* prime */
4007 #define FAULT_RANDOM_REFRESH 10000
4008
4009 /*
4010  * Crude but fast random-number generator.  Uses a linear congruential
4011  * generator, with occasional help from get_random_bytes().
4012  */
4013 static unsigned long
4014 _drbd_fault_random(struct fault_random_state *rsp)
4015 {
4016         long refresh;
4017
4018         if (!rsp->count--) {
4019                 get_random_bytes(&refresh, sizeof(refresh));
4020                 rsp->state += refresh;
4021                 rsp->count = FAULT_RANDOM_REFRESH;
4022         }
4023         rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4024         return swahw32(rsp->state);
4025 }
4026
4027 static char *
4028 _drbd_fault_str(unsigned int type) {
4029         static char *_faults[] = {
4030                 [DRBD_FAULT_MD_WR] = "Meta-data write",
4031                 [DRBD_FAULT_MD_RD] = "Meta-data read",
4032                 [DRBD_FAULT_RS_WR] = "Resync write",
4033                 [DRBD_FAULT_RS_RD] = "Resync read",
4034                 [DRBD_FAULT_DT_WR] = "Data write",
4035                 [DRBD_FAULT_DT_RD] = "Data read",
4036                 [DRBD_FAULT_DT_RA] = "Data read ahead",
4037                 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4038                 [DRBD_FAULT_AL_EE] = "EE allocation",
4039                 [DRBD_FAULT_RECEIVE] = "receive data corruption",
4040         };
4041
4042         return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4043 }
4044
4045 unsigned int
4046 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4047 {
4048         static struct fault_random_state rrs = {0, 0};
4049
4050         unsigned int ret = (
4051                 (fault_devs == 0 ||
4052                         ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4053                 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4054
4055         if (ret) {
4056                 fault_count++;
4057
4058                 if (__ratelimit(&drbd_ratelimit_state))
4059                         dev_warn(DEV, "***Simulating %s failure\n",
4060                                 _drbd_fault_str(type));
4061         }
4062
4063         return ret;
4064 }
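     /*
      * Illustrative sketch only: an IO submission path would typically
      * consult the fault injector right before issuing real IO, roughly
      * like this (the wrapper name is hypothetical):
      */
     static int __maybe_unused example_should_fail_data_write(struct drbd_conf *mdev)
     {
             return fault_rate && _drbd_insert_fault(mdev, DRBD_FAULT_DT_WR);
     }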
4065 #endif
4066
4067 const char *drbd_buildtag(void)
4068 {
4069         /* DRBD built from external sources carries a reference to the
4070            git hash of the source code here. */
4071
4072         static char buildtag[38] = "\0uilt-in";
4073
4074         if (buildtag[0] == 0) {
4075 #ifdef CONFIG_MODULES
4076                 if (THIS_MODULE != NULL)
4077                         sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4078                 else
4079 #endif
4080                         buildtag[0] = 'b';
4081         }
4082
4083         return buildtag;
4084 }
4085
4086 module_init(drbd_init)
4087 module_exit(drbd_cleanup)
4088
4089 EXPORT_SYMBOL(drbd_conn_str);
4090 EXPORT_SYMBOL(drbd_role_str);
4091 EXPORT_SYMBOL(drbd_disk_str);
4092 EXPORT_SYMBOL(drbd_set_st_err_str);