7eb447d20cced7e345f1f1025410553c3c0dc97c
[pandora-kernel.git] / drivers / block / drbd / drbd_main.c
1 /*
2    drbd.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27  */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
59 struct after_state_chg_work {
60         struct drbd_work w;
61         union drbd_state os;
62         union drbd_state ns;
63         enum chg_state_flags flags;
64         struct completion *done;
65 };
66
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
71
72 int drbd_init(void);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77                            union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82
83 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84               "Lars Ellenberg <lars@linbit.com>");
85 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86 MODULE_VERSION(REL_VERSION);
87 MODULE_LICENSE("GPL");
88 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
89 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
90
91 #include <linux/moduleparam.h>
92 /* allow_open_on_secondary */
93 MODULE_PARM_DESC(allow_oos, "DONT USE!");
94 /* thanks to these macros, if compiled into the kernel (not-module),
95  * this becomes the boot parameter drbd.minor_count */
96 module_param(minor_count, uint, 0444);
97 module_param(disable_sendpage, bool, 0644);
98 module_param(allow_oos, bool, 0);
99 module_param(cn_idx, uint, 0444);
100 module_param(proc_details, int, 0644);
101
#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault injection knobs; each variable is registered as a module parameter
 * right next to its definition. */

/* bitmap of enabled faults */
int enable_faults;
module_param(enable_faults, int, 0664);

/* fault rate % value - applies to all enabled faults */
int fault_rate;
module_param(fault_rate, int, 0664);

/* count of faults inserted */
static int fault_count;
module_param(fault_count, int, 0664);

/* bitmap of devices to insert faults on */
int fault_devs;
module_param(fault_devs, int, 0644);
#endif
116
117 /* module parameter, defined */
118 unsigned int minor_count = 32;
119 int disable_sendpage;
120 int allow_oos;
121 unsigned int cn_idx = CN_IDX_DRBD;
122 int proc_details;       /* Detail level in proc drbd*/
123
124 /* Module parameter for setting the user mode helper program
125  * to run. Default is /sbin/drbdadm */
126 char usermode_helper[80] = "/sbin/drbdadm";
127
128 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
129
130 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
131  * as member "struct gendisk *vdisk;"
132  */
133 struct drbd_conf **minor_table;
134
135 struct kmem_cache *drbd_request_cache;
136 struct kmem_cache *drbd_ee_cache;       /* epoch entries */
137 struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
138 struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
139 mempool_t *drbd_request_mempool;
140 mempool_t *drbd_ee_mempool;
141
/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list; the next pointer is stored in the
         "private" member of struct page.
 */
148 struct page *drbd_pp_pool;
149 spinlock_t   drbd_pp_lock;
150 int          drbd_pp_vacant;
151 wait_queue_head_t drbd_pp_wait;
152
153 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
154
155 static const struct block_device_operations drbd_ops = {
156         .owner =   THIS_MODULE,
157         .open =    drbd_open,
158         .release = drbd_release,
159 };
160
/* Number of elements of the array A.
 * A must be a real array (not a pointer an array decayed into). The argument
 * is parenthesized before indexing so that arbitrary array-valued expressions
 * expand with the intended precedence. */
#define ARRY_SIZE(A) (sizeof(A) / sizeof((A)[0]))
162
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.

   Takes a reference on mdev->local_cnt and returns 1 if the disk state is
   at least @mins. Otherwise the reference is dropped again (waking up
   misc_wait if the counter reached zero) and 0 is returned.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	atomic_inc(&mdev->local_cnt);
	if (mdev->state.disk >= mins)
		return 1;

	/* disk state is below @mins: give the reference back */
	if (atomic_dec_and_test(&mdev->local_cnt))
		wake_up(&mdev->misc_wait);
	return 0;
}

#endif
181
182 /**
183  * DOC: The transfer log
184  *
185  * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
186  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
187  * of the list. There is always at least one &struct drbd_tl_epoch object.
188  *
189  * Each &struct drbd_tl_epoch has a circular double linked list of requests
190  * attached.
191  */
192 static int tl_init(struct drbd_conf *mdev)
193 {
194         struct drbd_tl_epoch *b;
195
196         /* during device minor initialization, we may well use GFP_KERNEL */
197         b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
198         if (!b)
199                 return 0;
200         INIT_LIST_HEAD(&b->requests);
201         INIT_LIST_HEAD(&b->w.list);
202         b->next = NULL;
203         b->br_number = 4711;
204         b->n_writes = 0;
205         b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
206
207         mdev->oldest_tle = b;
208         mdev->newest_tle = b;
209         INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
210
211         mdev->tl_hash = NULL;
212         mdev->tl_hash_s = 0;
213
214         return 1;
215 }
216
217 static void tl_cleanup(struct drbd_conf *mdev)
218 {
219         D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
220         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
221         kfree(mdev->oldest_tle);
222         mdev->oldest_tle = NULL;
223         kfree(mdev->unused_spare_tle);
224         mdev->unused_spare_tle = NULL;
225         kfree(mdev->tl_hash);
226         mdev->tl_hash = NULL;
227         mdev->tl_hash_s = 0;
228 }
229
230 /**
231  * _tl_add_barrier() - Adds a barrier to the transfer log
232  * @mdev:       DRBD device.
233  * @new:        Barrier to be added before the current head of the TL.
234  *
235  * The caller must hold the req_lock.
236  */
237 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
238 {
239         struct drbd_tl_epoch *newest_before;
240
241         INIT_LIST_HEAD(&new->requests);
242         INIT_LIST_HEAD(&new->w.list);
243         new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
244         new->next = NULL;
245         new->n_writes = 0;
246
247         newest_before = mdev->newest_tle;
248         /* never send a barrier number == 0, because that is special-cased
249          * when using TCQ for our write ordering code */
250         new->br_number = (newest_before->br_number+1) ?: 1;
251         if (mdev->newest_tle != new) {
252                 mdev->newest_tle->next = new;
253                 mdev->newest_tle = new;
254         }
255 }
256
257 /**
258  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
259  * @mdev:       DRBD device.
260  * @barrier_nr: Expected identifier of the DRBD write barrier packet.
261  * @set_size:   Expected number of requests before that barrier.
262  *
263  * In case the passed barrier_nr or set_size does not match the oldest
264  * &struct drbd_tl_epoch objects this function will cause a termination
265  * of the connection.
266  */
267 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
268                        unsigned int set_size)
269 {
270         struct drbd_tl_epoch *b, *nob; /* next old barrier */
271         struct list_head *le, *tle;
272         struct drbd_request *r;
273
274         spin_lock_irq(&mdev->req_lock);
275
276         b = mdev->oldest_tle;
277
278         /* first some paranoia code */
279         if (b == NULL) {
280                 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
281                         barrier_nr);
282                 goto bail;
283         }
284         if (b->br_number != barrier_nr) {
285                 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
286                         barrier_nr, b->br_number);
287                 goto bail;
288         }
289         if (b->n_writes != set_size) {
290                 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
291                         barrier_nr, set_size, b->n_writes);
292                 goto bail;
293         }
294
295         /* Clean up list of requests processed during current epoch */
296         list_for_each_safe(le, tle, &b->requests) {
297                 r = list_entry(le, struct drbd_request, tl_requests);
298                 _req_mod(r, barrier_acked);
299         }
300         /* There could be requests on the list waiting for completion
301            of the write to the local disk. To avoid corruptions of
302            slab's data structures we have to remove the lists head.
303
304            Also there could have been a barrier ack out of sequence, overtaking
305            the write acks - which would be a bug and violating write ordering.
306            To not deadlock in case we lose connection while such requests are
307            still pending, we need some way to find them for the
308            _req_mode(connection_lost_while_pending).
309
310            These have been list_move'd to the out_of_sequence_requests list in
311            _req_mod(, barrier_acked) above.
312            */
313         list_del_init(&b->requests);
314
315         nob = b->next;
316         if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
317                 _tl_add_barrier(mdev, b);
318                 if (nob)
319                         mdev->oldest_tle = nob;
320                 /* if nob == NULL b was the only barrier, and becomes the new
321                    barrier. Therefore mdev->oldest_tle points already to b */
322         } else {
323                 D_ASSERT(nob != NULL);
324                 mdev->oldest_tle = nob;
325                 kfree(b);
326         }
327
328         spin_unlock_irq(&mdev->req_lock);
329         dec_ap_pending(mdev);
330
331         return;
332
333 bail:
334         spin_unlock_irq(&mdev->req_lock);
335         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
336 }
337
338 /**
339  * _tl_restart() - Walks the transfer log, and applies an action to all requests
340  * @mdev:       DRBD device.
341  * @what:       The action/event to perform with all request objects
342  *
343  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
344  * restart_frozen_disk_io.
345  */
346 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
347 {
348         struct drbd_tl_epoch *b, *tmp, **pn;
349         struct list_head *le, *tle, carry_reads;
350         struct drbd_request *req;
351         int rv, n_writes, n_reads;
352
353         b = mdev->oldest_tle;
354         pn = &mdev->oldest_tle;
355         while (b) {
356                 n_writes = 0;
357                 n_reads = 0;
358                 INIT_LIST_HEAD(&carry_reads);
359                 list_for_each_safe(le, tle, &b->requests) {
360                         req = list_entry(le, struct drbd_request, tl_requests);
361                         rv = _req_mod(req, what);
362
363                         n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
364                         n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
365                 }
366                 tmp = b->next;
367
368                 if (n_writes) {
369                         if (what == resend) {
370                                 b->n_writes = n_writes;
371                                 if (b->w.cb == NULL) {
372                                         b->w.cb = w_send_barrier;
373                                         inc_ap_pending(mdev);
374                                         set_bit(CREATE_BARRIER, &mdev->flags);
375                                 }
376
377                                 drbd_queue_work(&mdev->data.work, &b->w);
378                         }
379                         pn = &b->next;
380                 } else {
381                         if (n_reads)
382                                 list_add(&carry_reads, &b->requests);
383                         /* there could still be requests on that ring list,
384                          * in case local io is still pending */
385                         list_del(&b->requests);
386
387                         /* dec_ap_pending corresponding to queue_barrier.
388                          * the newest barrier may not have been queued yet,
389                          * in which case w.cb is still NULL. */
390                         if (b->w.cb != NULL)
391                                 dec_ap_pending(mdev);
392
393                         if (b == mdev->newest_tle) {
394                                 /* recycle, but reinit! */
395                                 D_ASSERT(tmp == NULL);
396                                 INIT_LIST_HEAD(&b->requests);
397                                 list_splice(&carry_reads, &b->requests);
398                                 INIT_LIST_HEAD(&b->w.list);
399                                 b->w.cb = NULL;
400                                 b->br_number = net_random();
401                                 b->n_writes = 0;
402
403                                 *pn = b;
404                                 break;
405                         }
406                         *pn = tmp;
407                         kfree(b);
408                 }
409                 b = tmp;
410                 list_splice(&carry_reads, &b->requests);
411         }
412 }
413
414
415 /**
416  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
417  * @mdev:       DRBD device.
418  *
419  * This is called after the connection to the peer was lost. The storage covered
420  * by the requests on the transfer gets marked as our of sync. Called from the
421  * receiver thread and the worker thread.
422  */
423 void tl_clear(struct drbd_conf *mdev)
424 {
425         struct list_head *le, *tle;
426         struct drbd_request *r;
427
428         spin_lock_irq(&mdev->req_lock);
429
430         _tl_restart(mdev, connection_lost_while_pending);
431
432         /* we expect this list to be empty. */
433         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
434
435         /* but just in case, clean it up anyways! */
436         list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
437                 r = list_entry(le, struct drbd_request, tl_requests);
438                 /* It would be nice to complete outside of spinlock.
439                  * But this is easier for now. */
440                 _req_mod(r, connection_lost_while_pending);
441         }
442
443         /* ensure bit indicating barrier is required is clear */
444         clear_bit(CREATE_BARRIER, &mdev->flags);
445
446         memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
447
448         spin_unlock_irq(&mdev->req_lock);
449 }
450
451 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
452 {
453         spin_lock_irq(&mdev->req_lock);
454         _tl_restart(mdev, what);
455         spin_unlock_irq(&mdev->req_lock);
456 }
457
458 /**
459  * cl_wide_st_chg() - true if the state change is a cluster wide one
460  * @mdev:       DRBD device.
461  * @os:         old (current) state.
462  * @ns:         new (wanted) state.
463  */
464 static int cl_wide_st_chg(struct drbd_conf *mdev,
465                           union drbd_state os, union drbd_state ns)
466 {
467         return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
468                  ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
469                   (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
470                   (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
471                   (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
472                 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
473                 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
474 }
475
476 enum drbd_state_rv
477 drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
478                   union drbd_state mask, union drbd_state val)
479 {
480         unsigned long flags;
481         union drbd_state os, ns;
482         enum drbd_state_rv rv;
483
484         spin_lock_irqsave(&mdev->req_lock, flags);
485         os = mdev->state;
486         ns.i = (os.i & ~mask.i) | val.i;
487         rv = _drbd_set_state(mdev, ns, f, NULL);
488         ns = mdev->state;
489         spin_unlock_irqrestore(&mdev->req_lock, flags);
490
491         return rv;
492 }
493
494 /**
495  * drbd_force_state() - Impose a change which happens outside our control on our state
496  * @mdev:       DRBD device.
497  * @mask:       mask of state bits to change.
498  * @val:        value of new state bits.
499  */
500 void drbd_force_state(struct drbd_conf *mdev,
501         union drbd_state mask, union drbd_state val)
502 {
503         drbd_change_state(mdev, CS_HARD, mask, val);
504 }
505
506 static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
507 static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
508                                                     union drbd_state,
509                                                     union drbd_state);
510 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
511                                        union drbd_state ns, const char **warn_sync_abort);
512 int drbd_send_state_req(struct drbd_conf *,
513                         union drbd_state, union drbd_state);
514
515 static enum drbd_state_rv
516 _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
517              union drbd_state val)
518 {
519         union drbd_state os, ns;
520         unsigned long flags;
521         enum drbd_state_rv rv;
522
523         if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
524                 return SS_CW_SUCCESS;
525
526         if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
527                 return SS_CW_FAILED_BY_PEER;
528
529         rv = 0;
530         spin_lock_irqsave(&mdev->req_lock, flags);
531         os = mdev->state;
532         ns.i = (os.i & ~mask.i) | val.i;
533         ns = sanitize_state(mdev, os, ns, NULL);
534
535         if (!cl_wide_st_chg(mdev, os, ns))
536                 rv = SS_CW_NO_NEED;
537         if (!rv) {
538                 rv = is_valid_state(mdev, ns);
539                 if (rv == SS_SUCCESS) {
540                         rv = is_valid_state_transition(mdev, ns, os);
541                         if (rv == SS_SUCCESS)
542                                 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
543                 }
544         }
545         spin_unlock_irqrestore(&mdev->req_lock, flags);
546
547         return rv;
548 }
549
550 /**
551  * drbd_req_state() - Perform an eventually cluster wide state change
552  * @mdev:       DRBD device.
553  * @mask:       mask of state bits to change.
554  * @val:        value of new state bits.
555  * @f:          flags
556  *
557  * Should not be called directly, use drbd_request_state() or
558  * _drbd_request_state().
559  */
560 static enum drbd_state_rv
561 drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
562                union drbd_state val, enum chg_state_flags f)
563 {
564         struct completion done;
565         unsigned long flags;
566         union drbd_state os, ns;
567         enum drbd_state_rv rv;
568
569         init_completion(&done);
570
571         if (f & CS_SERIALIZE)
572                 mutex_lock(&mdev->state_mutex);
573
574         spin_lock_irqsave(&mdev->req_lock, flags);
575         os = mdev->state;
576         ns.i = (os.i & ~mask.i) | val.i;
577         ns = sanitize_state(mdev, os, ns, NULL);
578
579         if (cl_wide_st_chg(mdev, os, ns)) {
580                 rv = is_valid_state(mdev, ns);
581                 if (rv == SS_SUCCESS)
582                         rv = is_valid_state_transition(mdev, ns, os);
583                 spin_unlock_irqrestore(&mdev->req_lock, flags);
584
585                 if (rv < SS_SUCCESS) {
586                         if (f & CS_VERBOSE)
587                                 print_st_err(mdev, os, ns, rv);
588                         goto abort;
589                 }
590
591                 drbd_state_lock(mdev);
592                 if (!drbd_send_state_req(mdev, mask, val)) {
593                         drbd_state_unlock(mdev);
594                         rv = SS_CW_FAILED_BY_PEER;
595                         if (f & CS_VERBOSE)
596                                 print_st_err(mdev, os, ns, rv);
597                         goto abort;
598                 }
599
600                 wait_event(mdev->state_wait,
601                         (rv = _req_st_cond(mdev, mask, val)));
602
603                 if (rv < SS_SUCCESS) {
604                         drbd_state_unlock(mdev);
605                         if (f & CS_VERBOSE)
606                                 print_st_err(mdev, os, ns, rv);
607                         goto abort;
608                 }
609                 spin_lock_irqsave(&mdev->req_lock, flags);
610                 os = mdev->state;
611                 ns.i = (os.i & ~mask.i) | val.i;
612                 rv = _drbd_set_state(mdev, ns, f, &done);
613                 drbd_state_unlock(mdev);
614         } else {
615                 rv = _drbd_set_state(mdev, ns, f, &done);
616         }
617
618         spin_unlock_irqrestore(&mdev->req_lock, flags);
619
620         if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
621                 D_ASSERT(current != mdev->worker.task);
622                 wait_for_completion(&done);
623         }
624
625 abort:
626         if (f & CS_SERIALIZE)
627                 mutex_unlock(&mdev->state_mutex);
628
629         return rv;
630 }
631
632 /**
633  * _drbd_request_state() - Request a state change (with flags)
634  * @mdev:       DRBD device.
635  * @mask:       mask of state bits to change.
636  * @val:        value of new state bits.
637  * @f:          flags
638  *
639  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
640  * flag, or when logging of failed state change requests is not desired.
641  */
642 enum drbd_state_rv
643 _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
644                     union drbd_state val, enum chg_state_flags f)
645 {
646         enum drbd_state_rv rv;
647
648         wait_event(mdev->state_wait,
649                    (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
650
651         return rv;
652 }
653
654 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
655 {
656         dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
657             name,
658             drbd_conn_str(ns.conn),
659             drbd_role_str(ns.role),
660             drbd_role_str(ns.peer),
661             drbd_disk_str(ns.disk),
662             drbd_disk_str(ns.pdsk),
663             is_susp(ns) ? 's' : 'r',
664             ns.aftr_isp ? 'a' : '-',
665             ns.peer_isp ? 'p' : '-',
666             ns.user_isp ? 'u' : '-'
667             );
668 }
669
670 void print_st_err(struct drbd_conf *mdev, union drbd_state os,
671                   union drbd_state ns, enum drbd_state_rv err)
672 {
673         if (err == SS_IN_TRANSIENT_STATE)
674                 return;
675         dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
676         print_st(mdev, " state", os);
677         print_st(mdev, "wanted", ns);
678 }
679
680
681 /**
682  * is_valid_state() - Returns an SS_ error code if ns is not valid
683  * @mdev:       DRBD device.
684  * @ns:         State to consider.
685  */
686 static enum drbd_state_rv
687 is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
688 {
689         /* See drbd_state_sw_errors in drbd_strings.c */
690
691         enum drbd_fencing_p fp;
692         enum drbd_state_rv rv = SS_SUCCESS;
693
694         fp = FP_DONT_CARE;
695         if (get_ldev(mdev)) {
696                 fp = mdev->ldev->dc.fencing;
697                 put_ldev(mdev);
698         }
699
700         if (get_net_conf(mdev)) {
701                 if (!mdev->net_conf->two_primaries &&
702                     ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
703                         rv = SS_TWO_PRIMARIES;
704                 put_net_conf(mdev);
705         }
706
707         if (rv <= 0)
708                 /* already found a reason to abort */;
709         else if (ns.role == R_SECONDARY && mdev->open_cnt)
710                 rv = SS_DEVICE_IN_USE;
711
712         else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
713                 rv = SS_NO_UP_TO_DATE_DISK;
714
715         else if (fp >= FP_RESOURCE &&
716                  ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
717                 rv = SS_PRIMARY_NOP;
718
719         else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
720                 rv = SS_NO_UP_TO_DATE_DISK;
721
722         else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
723                 rv = SS_NO_LOCAL_DISK;
724
725         else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
726                 rv = SS_NO_REMOTE_DISK;
727
728         else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
729                 rv = SS_NO_UP_TO_DATE_DISK;
730
731         else if ((ns.conn == C_CONNECTED ||
732                   ns.conn == C_WF_BITMAP_S ||
733                   ns.conn == C_SYNC_SOURCE ||
734                   ns.conn == C_PAUSED_SYNC_S) &&
735                   ns.disk == D_OUTDATED)
736                 rv = SS_CONNECTED_OUTDATES;
737
738         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
739                  (mdev->sync_conf.verify_alg[0] == 0))
740                 rv = SS_NO_VERIFY_ALG;
741
742         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
743                   mdev->agreed_pro_version < 88)
744                 rv = SS_NOT_SUPPORTED;
745
746         return rv;
747 }
748
749 /**
750  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
751  * @mdev:       DRBD device.
752  * @ns:         new state.
753  * @os:         old state.
754  */
755 static enum drbd_state_rv
756 is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
757                           union drbd_state os)
758 {
759         enum drbd_state_rv rv = SS_SUCCESS;
760
761         if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
762             os.conn > C_CONNECTED)
763                 rv = SS_RESYNC_RUNNING;
764
765         if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
766                 rv = SS_ALREADY_STANDALONE;
767
768         if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
769                 rv = SS_IS_DISKLESS;
770
771         if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
772                 rv = SS_NO_NET_CONFIG;
773
774         if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
775                 rv = SS_LOWER_THAN_OUTDATED;
776
777         if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
778                 rv = SS_IN_TRANSIENT_STATE;
779
780         if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
781                 rv = SS_IN_TRANSIENT_STATE;
782
783         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
784                 rv = SS_NEED_CONNECTION;
785
786         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
787             ns.conn != os.conn && os.conn > C_CONNECTED)
788                 rv = SS_RESYNC_RUNNING;
789
790         if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
791             os.conn < C_CONNECTED)
792                 rv = SS_NEED_CONNECTION;
793
794         if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
795             && os.conn < C_WF_REPORT_PARAMS)
796                 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
797
798         return rv;
799 }
800
801 /**
802  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
803  * @mdev:       DRBD device.
804  * @os:         old state.
805  * @ns:         new state.
806  * @warn_sync_abort:
807  *
 * When we lose the connection, we have to set the state of the peer's disk
 * (pdsk) to D_UNKNOWN. This rule and many more along those lines are in this
 * function.
810  */
811 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
812                                        union drbd_state ns, const char **warn_sync_abort)
813 {
814         enum drbd_fencing_p fp;
815         enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
816
817         fp = FP_DONT_CARE;
818         if (get_ldev(mdev)) {
819                 fp = mdev->ldev->dc.fencing;
820                 put_ldev(mdev);
821         }
822
823         /* Disallow Network errors to configure a device's network part */
824         if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
825             os.conn <= C_DISCONNECTING)
826                 ns.conn = os.conn;
827
828         /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
829          * If you try to go into some Sync* state, that shall fail (elsewhere). */
830         if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
831             ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
832                 ns.conn = os.conn;
833
834         /* we cannot fail (again) if we already detached */
835         if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
836                 ns.disk = D_DISKLESS;
837
838         /* if we are only D_ATTACHING yet,
839          * we can (and should) go directly to D_DISKLESS. */
840         if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
841                 ns.disk = D_DISKLESS;
842
843         /* After C_DISCONNECTING only C_STANDALONE may follow */
844         if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
845                 ns.conn = os.conn;
846
847         if (ns.conn < C_CONNECTED) {
848                 ns.peer_isp = 0;
849                 ns.peer = R_UNKNOWN;
850                 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
851                         ns.pdsk = D_UNKNOWN;
852         }
853
854         /* Clear the aftr_isp when becoming unconfigured */
855         if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
856                 ns.aftr_isp = 0;
857
858         /* Abort resync if a disk fails/detaches */
859         if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
860             (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
861                 if (warn_sync_abort)
862                         *warn_sync_abort =
863                                 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
864                                 "Online-verify" : "Resync";
865                 ns.conn = C_CONNECTED;
866         }
867
868         /* Connection breaks down before we finished "Negotiating" */
869         if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
870             get_ldev_if_state(mdev, D_NEGOTIATING)) {
871                 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
872                         ns.disk = mdev->new_state_tmp.disk;
873                         ns.pdsk = mdev->new_state_tmp.pdsk;
874                 } else {
875                         dev_alert(DEV, "Connection lost while negotiating, no data!\n");
876                         ns.disk = D_DISKLESS;
877                         ns.pdsk = D_UNKNOWN;
878                 }
879                 put_ldev(mdev);
880         }
881
882         /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
883         if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
884                 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
885                         ns.disk = D_UP_TO_DATE;
886                 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
887                         ns.pdsk = D_UP_TO_DATE;
888         }
889
890         /* Implications of the connection stat on the disk states */
891         disk_min = D_DISKLESS;
892         disk_max = D_UP_TO_DATE;
893         pdsk_min = D_INCONSISTENT;
894         pdsk_max = D_UNKNOWN;
895         switch ((enum drbd_conns)ns.conn) {
896         case C_WF_BITMAP_T:
897         case C_PAUSED_SYNC_T:
898         case C_STARTING_SYNC_T:
899         case C_WF_SYNC_UUID:
900         case C_BEHIND:
901                 disk_min = D_INCONSISTENT;
902                 disk_max = D_OUTDATED;
903                 pdsk_min = D_UP_TO_DATE;
904                 pdsk_max = D_UP_TO_DATE;
905                 break;
906         case C_VERIFY_S:
907         case C_VERIFY_T:
908                 disk_min = D_UP_TO_DATE;
909                 disk_max = D_UP_TO_DATE;
910                 pdsk_min = D_UP_TO_DATE;
911                 pdsk_max = D_UP_TO_DATE;
912                 break;
913         case C_CONNECTED:
914                 disk_min = D_DISKLESS;
915                 disk_max = D_UP_TO_DATE;
916                 pdsk_min = D_DISKLESS;
917                 pdsk_max = D_UP_TO_DATE;
918                 break;
919         case C_WF_BITMAP_S:
920         case C_PAUSED_SYNC_S:
921         case C_STARTING_SYNC_S:
922         case C_AHEAD:
923                 disk_min = D_UP_TO_DATE;
924                 disk_max = D_UP_TO_DATE;
925                 pdsk_min = D_INCONSISTENT;
926                 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
927                 break;
928         case C_SYNC_TARGET:
929                 disk_min = D_INCONSISTENT;
930                 disk_max = D_INCONSISTENT;
931                 pdsk_min = D_UP_TO_DATE;
932                 pdsk_max = D_UP_TO_DATE;
933                 break;
934         case C_SYNC_SOURCE:
935                 disk_min = D_UP_TO_DATE;
936                 disk_max = D_UP_TO_DATE;
937                 pdsk_min = D_INCONSISTENT;
938                 pdsk_max = D_INCONSISTENT;
939                 break;
940         case C_STANDALONE:
941         case C_DISCONNECTING:
942         case C_UNCONNECTED:
943         case C_TIMEOUT:
944         case C_BROKEN_PIPE:
945         case C_NETWORK_FAILURE:
946         case C_PROTOCOL_ERROR:
947         case C_TEAR_DOWN:
948         case C_WF_CONNECTION:
949         case C_WF_REPORT_PARAMS:
950         case C_MASK:
951                 break;
952         }
953         if (ns.disk > disk_max)
954                 ns.disk = disk_max;
955
956         if (ns.disk < disk_min) {
957                 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
958                          drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
959                 ns.disk = disk_min;
960         }
961         if (ns.pdsk > pdsk_max)
962                 ns.pdsk = pdsk_max;
963
964         if (ns.pdsk < pdsk_min) {
965                 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
966                          drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
967                 ns.pdsk = pdsk_min;
968         }
969
970         if (fp == FP_STONITH &&
971             (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
972             !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
973                 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
974
975         if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
976             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
977             !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
978                 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
979
980         if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
981                 if (ns.conn == C_SYNC_SOURCE)
982                         ns.conn = C_PAUSED_SYNC_S;
983                 if (ns.conn == C_SYNC_TARGET)
984                         ns.conn = C_PAUSED_SYNC_T;
985         } else {
986                 if (ns.conn == C_PAUSED_SYNC_S)
987                         ns.conn = C_SYNC_SOURCE;
988                 if (ns.conn == C_PAUSED_SYNC_T)
989                         ns.conn = C_SYNC_TARGET;
990         }
991
992         return ns;
993 }
994
995 /* helper for __drbd_set_state */
996 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
997 {
998         if (mdev->agreed_pro_version < 90)
999                 mdev->ov_start_sector = 0;
1000         mdev->rs_total = drbd_bm_bits(mdev);
1001         mdev->ov_position = 0;
1002         if (cs == C_VERIFY_T) {
1003                 /* starting online verify from an arbitrary position
1004                  * does not fit well into the existing protocol.
1005                  * on C_VERIFY_T, we initialize ov_left and friends
1006                  * implicitly in receive_DataRequest once the
1007                  * first P_OV_REQUEST is received */
1008                 mdev->ov_start_sector = ~(sector_t)0;
1009         } else {
1010                 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1011                 if (bit >= mdev->rs_total) {
1012                         mdev->ov_start_sector =
1013                                 BM_BIT_TO_SECT(mdev->rs_total - 1);
1014                         mdev->rs_total = 1;
1015                 } else
1016                         mdev->rs_total -= bit;
1017                 mdev->ov_position = mdev->ov_start_sector;
1018         }
1019         mdev->ov_left = mdev->rs_total;
1020 }
1021
1022 static void drbd_resume_al(struct drbd_conf *mdev)
1023 {
1024         if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1025                 dev_info(DEV, "Resumed AL updates\n");
1026 }
1027
1028 /**
1029  * __drbd_set_state() - Set a new DRBD state
1030  * @mdev:       DRBD device.
1031  * @ns:         new state.
1032  * @flags:      Flags; CS_HARD skips the validity checks, CS_VERBOSE logs a rejected change.
1033  * @done:       Optional completion, completed by w_after_state_ch() after after_state_ch() finished (only if CS_WAIT_COMPLETE is set)
1034  *
1035  * Caller needs to hold req_lock, and global_state_lock. Do not call directly. Returns SS_NOTHING_TO_DO, a value < SS_SUCCESS on rejection, or the check result.
1036  */
1037 enum drbd_state_rv
1038 __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1039                  enum chg_state_flags flags, struct completion *done)
1040 {
1041         union drbd_state os;
1042         enum drbd_state_rv rv = SS_SUCCESS;
1043         const char *warn_sync_abort = NULL;
1044         struct after_state_chg_work *ascw;
1045
1046         os = mdev->state;
1047
1048         ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1049
1050         if (ns.i == os.i) /* compared after sanitizing: the request may have been reduced to a no-op */
1051                 return SS_NOTHING_TO_DO;
1052
1053         if (!(flags & CS_HARD)) {
1054                 /*  pre-state-change checks ; only look at ns  */
1055                 /* See drbd_state_sw_errors in drbd_strings.c */
1056
1057                 rv = is_valid_state(mdev, ns);
1058                 if (rv < SS_SUCCESS) {
1059                         /* If the old state was illegal as well (same error code),
1060                            then let this happen...*/
1061
1062                         if (is_valid_state(mdev, os) == rv)
1063                                 rv = is_valid_state_transition(mdev, ns, os);
1064                 } else
1065                         rv = is_valid_state_transition(mdev, ns, os);
1066         }
1067
1068         if (rv < SS_SUCCESS) { /* rejected: nothing has been modified up to here */
1069                 if (flags & CS_VERBOSE)
1070                         print_st_err(mdev, os, ns, rv);
1071                 return rv;
1072         }
1073
1074         if (warn_sync_abort)
1075                 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1076
1077         { /* build and log a one-line summary of every field that changes */
1078         char *pbp, pb[300]; /* pb is the log line buffer, pbp the write cursor into it */
1079         pbp = pb;
1080         *pbp = 0;
1081         if (ns.role != os.role)
1082                 pbp += sprintf(pbp, "role( %s -> %s ) ",
1083                                drbd_role_str(os.role),
1084                                drbd_role_str(ns.role));
1085         if (ns.peer != os.peer)
1086                 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1087                                drbd_role_str(os.peer),
1088                                drbd_role_str(ns.peer));
1089         if (ns.conn != os.conn)
1090                 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1091                                drbd_conn_str(os.conn),
1092                                drbd_conn_str(ns.conn));
1093         if (ns.disk != os.disk)
1094                 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1095                                drbd_disk_str(os.disk),
1096                                drbd_disk_str(ns.disk));
1097         if (ns.pdsk != os.pdsk)
1098                 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1099                                drbd_disk_str(os.pdsk),
1100                                drbd_disk_str(ns.pdsk));
1101         if (is_susp(ns) != is_susp(os))
1102                 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1103                                is_susp(os),
1104                                is_susp(ns));
1105         if (ns.aftr_isp != os.aftr_isp)
1106                 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1107                                os.aftr_isp,
1108                                ns.aftr_isp);
1109         if (ns.peer_isp != os.peer_isp)
1110                 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1111                                os.peer_isp,
1112                                ns.peer_isp);
1113         if (ns.user_isp != os.user_isp)
1114                 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1115                                os.user_isp,
1116                                ns.user_isp);
1117         dev_info(DEV, "%s\n", pb);
1118         }
1119
1120         /* solve the race between becoming unconfigured,
1121          * worker doing the cleanup, and
1122          * admin reconfiguring us:
1123          * on (re)configure, first set CONFIG_PENDING,
1124          * then wait for a potentially exiting worker,
1125          * start the worker, and schedule one no_op.
1126          * then proceed with configuration.
1127          */
1128         if (ns.disk == D_DISKLESS &&
1129             ns.conn == C_STANDALONE &&
1130             ns.role == R_SECONDARY &&
1131             !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1132                 set_bit(DEVICE_DYING, &mdev->flags);
1133
1134         /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1135          * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1136          * drbd_ldev_destroy() won't happen before our corresponding
1137          * after_state_ch works run, where we put_ldev again. */
1138         if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1139             (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1140                 atomic_inc(&mdev->local_cnt);
1141
1142         mdev->state = ns; /* the new state is stored here; waiters are woken right after */
1143         wake_up(&mdev->misc_wait);
1144         wake_up(&mdev->state_wait);
1145
1146         /* aborted verify run. log the last position */
1147         if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1148             ns.conn < C_CONNECTED) {
1149                 mdev->ov_start_sector =
1150                         BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1151                 dev_info(DEV, "Online Verify reached sector %llu\n",
1152                         (unsigned long long)mdev->ov_start_sector);
1153         }
1154
1155         if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1156             (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1157                 dev_info(DEV, "Syncer continues.\n");
1158                 mdev->rs_paused += (long)jiffies /* add the jiffies elapsed since the mark time stored on suspend */
1159                                   -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1160                 if (ns.conn == C_SYNC_TARGET)
1161                         mod_timer(&mdev->resync_timer, jiffies);
1162         }
1163
1164         if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1165             (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1166                 dev_info(DEV, "Resync suspended\n");
1167                 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; /* remembered for the rs_paused accounting above */
1168         }
1169
1170         if (os.conn == C_CONNECTED && /* entering online verify: reset position, marks and counters */
1171             (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1172                 unsigned long now = jiffies;
1173                 int i;
1174
1175                 set_ov_position(mdev, ns.conn);
1176                 mdev->rs_start = now;
1177                 mdev->rs_last_events = 0;
1178                 mdev->rs_last_sect_ev = 0;
1179                 mdev->ov_last_oos_size = 0;
1180                 mdev->ov_last_oos_start = 0;
1181
1182                 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1183                         mdev->rs_mark_left[i] = mdev->ov_left;
1184                         mdev->rs_mark_time[i] = now;
1185                 }
1186
1187                 drbd_rs_controller_reset(mdev);
1188
1189                 if (ns.conn == C_VERIFY_S) {
1190                         dev_info(DEV, "Starting Online Verify from sector %llu\n",
1191                                         (unsigned long long)mdev->ov_position);
1192                         mod_timer(&mdev->resync_timer, jiffies);
1193                 }
1194         }
1195
1196         if (get_ldev(mdev)) { /* recompute the meta data flags from the state that was just stored */
1197                 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1198                                                  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1199                                                  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1200
1201                 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1202                         mdf |= MDF_CRASHED_PRIMARY;
1203                 if (mdev->state.role == R_PRIMARY ||
1204                     (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1205                         mdf |= MDF_PRIMARY_IND;
1206                 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1207                         mdf |= MDF_CONNECTED_IND;
1208                 if (mdev->state.disk > D_INCONSISTENT)
1209                         mdf |= MDF_CONSISTENT;
1210                 if (mdev->state.disk > D_OUTDATED)
1211                         mdf |= MDF_WAS_UP_TO_DATE;
1212                 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1213                         mdf |= MDF_PEER_OUT_DATED;
1214                 if (mdf != mdev->ldev->md.flags) { /* mark dirty only if a flag really changed */
1215                         mdev->ldev->md.flags = mdf;
1216                         drbd_md_mark_dirty(mdev);
1217                 }
1218                 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1219                         drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1220                 put_ldev(mdev);
1221         }
1222
1223         /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1224         if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1225             os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1226                 set_bit(CONSIDER_RESYNC, &mdev->flags);
1227
1228         /* Receiver should clean up itself */
1229         if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1230                 drbd_thread_stop_nowait(&mdev->receiver);
1231
1232         /* Now the receiver finished cleaning up itself, it should die */
1233         if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1234                 drbd_thread_stop_nowait(&mdev->receiver);
1235
1236         /* Upon network failure, we need to restart the receiver. */
1237         if (os.conn > C_TEAR_DOWN &&
1238             ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1239                 drbd_thread_restart_nowait(&mdev->receiver);
1240
1241         /* Resume AL writing if we get a connection */
1242         if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1243                 drbd_resume_al(mdev);
1244
1245         ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); /* GFP_ATOMIC: see the locking note in the header comment */
1246         if (ascw) { /* hand the sleeping part (after_state_ch) over to the data.work queue */
1247                 ascw->os = os;
1248                 ascw->ns = ns;
1249                 ascw->flags = flags;
1250                 ascw->w.cb = w_after_state_ch;
1251                 ascw->done = done;
1252                 drbd_queue_work(&mdev->data.work, &ascw->w);
1253         } else {
1254                 dev_warn(DEV, "Could not kmalloc an ascw\n"); /* NOTE(review): no work is queued here, so after_state_ch() is not run and @done is never completed - confirm callers cope */
1255         }
1256
1257         return rv;
1258 }
1259
1260 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1261 {
1262         struct after_state_chg_work *ascw =
1263                 container_of(w, struct after_state_chg_work, w);
1264         after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1265         if (ascw->flags & CS_WAIT_COMPLETE) {
1266                 D_ASSERT(ascw->done != NULL);
1267                 complete(ascw->done);
1268         }
1269         kfree(ascw);
1270
1271         return 1;
1272 }
1273
1274 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1275 {
1276         if (rv) {
1277                 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1278                 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1279                 return;
1280         }
1281
1282         switch (mdev->state.conn) {
1283         case C_STARTING_SYNC_T:
1284                 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1285                 break;
1286         case C_STARTING_SYNC_S:
1287                 drbd_start_resync(mdev, C_SYNC_SOURCE);
1288                 break;
1289         }
1290 }
1291
1292 /**
1293  * after_state_ch() - Perform after state change actions that may sleep
1294  * @mdev:       DRBD device.
1295  * @os:         old state.
1296  * @ns:         new state.
1297  * @flags:      Flags of the state change request (not evaluated in this function).
1298  */
1299 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1300                            union drbd_state ns, enum chg_state_flags flags)
1301 {
1302         enum drbd_fencing_p fp;
1303         enum drbd_req_event what = nothing;
1304         union drbd_state nsm = (union drbd_state){ .i = -1 };
1305
1306         if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1307                 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1308                 if (mdev->p_uuid)
1309                         mdev->p_uuid[UI_FLAGS] &= ~((u64)2); /* clears bit 1 of the peer's UUID flags; its meaning is not visible here */
1310         }
1311
1312         fp = FP_DONT_CARE; /* stays FP_DONT_CARE unless get_ldev() succeeds below */
1313         if (get_ldev(mdev)) {
1314                 fp = mdev->ldev->dc.fencing;
1315                 put_ldev(mdev);
1316         }
1317
1318         /* Inform userspace about the change... */
1319         drbd_bcast_state(mdev, ns);
1320
1321         if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1322             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1323                 drbd_khelper(mdev, "pri-on-incon-degr");
1324
1325         /* Here we have the actions that are performed after a
1326            state change. This function might sleep */
1327
1328         nsm.i = -1; /* all bits set: "nsm.i &= mdev->state.i" below then only clears the fields zeroed in between */
1329         if (ns.susp_nod) {
1330                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1331                         if (ns.conn == C_CONNECTED)
1332                                 what = resend, nsm.susp_nod = 0; /* comma operator: both assignments belong to this branch */
1333                         else /* ns.conn > C_CONNECTED */
1334                                 dev_err(DEV, "Unexpected Resynd going on!\n"); /* NOTE(review): "Resynd" is presumably a typo for "Resync" */
1335                 }
1336
1337                 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1338                         what = restart_frozen_disk_io, nsm.susp_nod = 0; /* comma operator again; overrides a "resend" chosen above */
1339
1340         }
1341
1342         if (ns.susp_fen) {
1343                 /* case1: The outdate peer handler is successful: */
1344                 if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1345                         tl_clear(mdev);
1346                         if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1347                                 drbd_uuid_new_current(mdev);
1348                                 clear_bit(NEW_CUR_UUID, &mdev->flags);
1349                         }
1350                         spin_lock_irq(&mdev->req_lock);
1351                         _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1352                         spin_unlock_irq(&mdev->req_lock);
1353                 }
1354                 /* case2: The connection was established again: */
1355                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1356                         clear_bit(NEW_CUR_UUID, &mdev->flags);
1357                         what = resend;
1358                         nsm.susp_fen = 0;
1359                 }
1360         }
1361
1362         if (what != nothing) { /* restart the transfer log and drop the suspend bits cleared in nsm */
1363                 spin_lock_irq(&mdev->req_lock);
1364                 _tl_restart(mdev, what);
1365                 nsm.i &= mdev->state.i; /* current state minus the fields zeroed in nsm */
1366                 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1367                 spin_unlock_irq(&mdev->req_lock);
1368         }
1369
1370         /* Do not change the order of the if above and the two below... */
1371         if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1372                 drbd_send_uuids(mdev);
1373                 drbd_send_state(mdev);
1374         }
1375         if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1376                 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1377
1378         /* Lost contact to peer's copy of the data */
1379         if ((os.pdsk >= D_INCONSISTENT &&
1380              os.pdsk != D_UNKNOWN &&
1381              os.pdsk != D_OUTDATED)
1382         &&  (ns.pdsk < D_INCONSISTENT ||
1383              ns.pdsk == D_UNKNOWN ||
1384              ns.pdsk == D_OUTDATED)) {
1385                 if (get_ldev(mdev)) {
1386                         if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1387                             mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1388                                 if (is_susp(mdev->state)) { /* suspended: only flag it, the UUID is created in the susp_fen case1 path above */
1389                                         set_bit(NEW_CUR_UUID, &mdev->flags);
1390                                 } else {
1391                                         drbd_uuid_new_current(mdev);
1392                                         drbd_send_uuids(mdev);
1393                                 }
1394                         }
1395                         put_ldev(mdev);
1396                 }
1397         }
1398
1399         if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1400                 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1401                         drbd_uuid_new_current(mdev);
1402                         drbd_send_uuids(mdev);
1403                 }
1404
1405                 /* D_DISKLESS Peer becomes secondary */
1406                 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1407                         drbd_al_to_on_disk_bm(mdev);
1408                 put_ldev(mdev);
1409         }
1410
1411         /* Last part of the attaching process ... */
1412         if (ns.conn >= C_CONNECTED &&
1413             os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1414                 drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1415                 drbd_send_uuids(mdev);
1416                 drbd_send_state(mdev);
1417         }
1418
1419         /* We want to pause/continue resync, tell peer. */
1420         if (ns.conn >= C_CONNECTED &&
1421              ((os.aftr_isp != ns.aftr_isp) ||
1422               (os.user_isp != ns.user_isp)))
1423                 drbd_send_state(mdev);
1424
1425         /* In case one of the isp bits got set, suspend other devices. */
1426         if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1427             (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1428                 suspend_other_sg(mdev);
1429
1430         /* Make sure the peer gets informed about eventual state
1431            changes (ISP bits) while we were in WFReportParams. */
1432         if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1433                 drbd_send_state(mdev);
1434
1435         if (os.conn != C_AHEAD && ns.conn == C_AHEAD) /* tell the peer that we entered C_AHEAD */
1436                 drbd_send_state(mdev);
1437
1438         /* We are in the progress to start a full sync... */
1439         if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1440             (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1441                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1442
1443         /* We are invalidating ourselves... */
1444         if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1445             os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1446                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1447
1448         /* first half of local IO error, failure to attach,
1449          * or administrative detach */
1450         if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1451                 enum drbd_io_error_p eh;
1452                 int was_io_error;
1453                 /* corresponding get_ldev was in __drbd_set_state, to serialize
1454                  * our cleanup here with the transition to D_DISKLESS,
1455                  * so it is safe to dereference ldev here. */
1456                 eh = mdev->ldev->dc.on_io_error;
1457                 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1458
1459                 /* current state still has to be D_FAILED,
1460                  * there is only one way out: to D_DISKLESS,
1461                  * and that may only happen after our put_ldev below. */
1462                 if (mdev->state.disk != D_FAILED)
1463                         dev_err(DEV,
1464                                 "ASSERT FAILED: disk is %s during detach\n",
1465                                 drbd_disk_str(mdev->state.disk));
1466
1467                 if (drbd_send_state(mdev)) /* non-zero return is treated as success here */
1468                         dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1469                 else
1470                         dev_err(DEV, "Sending state for detaching disk failed\n");
1471
1472                 drbd_rs_cancel_all(mdev);
1473
1474                 /* In case we want to get something to stable storage still,
1475                  * this may be the last chance.
1476                  * Following put_ldev may transition to D_DISKLESS. */
1477                 drbd_md_sync(mdev);
1478                 put_ldev(mdev);
1479
1480                 if (was_io_error && eh == EP_CALL_HELPER) /* eh was read above, before the put_ldev */
1481                         drbd_khelper(mdev, "local-io-error");
1482         }
1483
1484         /* second half of local IO error, failure to attach,
1485          * or administrative detach,
1486          * after local_cnt references have reached zero again */
1487         if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1488                 /* We must still be diskless,
1489                  * re-attach has to be serialized with this! */
1490                 if (mdev->state.disk != D_DISKLESS)
1491                         dev_err(DEV,
1492                                 "ASSERT FAILED: disk is %s while going diskless\n",
1493                                 drbd_disk_str(mdev->state.disk));
1494
1495                 mdev->rs_total = 0;
1496                 mdev->rs_failed = 0;
1497                 atomic_set(&mdev->rs_pending_cnt, 0);
1498
1499                 if (drbd_send_state(mdev)) /* non-zero return is treated as success here */
1500                         dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1501                 else
1502                         dev_err(DEV, "Sending state for being diskless failed\n");
1503                 /* corresponding get_ldev in __drbd_set_state
1504                  * this may finally trigger drbd_ldev_destroy. */
1505                 put_ldev(mdev);
1506         }
1507
1508         /* Disks got bigger while they were detached */
1509         if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1510             test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { /* the flag is consumed even if we are not C_CONNECTED */
1511                 if (ns.conn == C_CONNECTED)
1512                         resync_after_online_grow(mdev);
1513         }
1514
1515         /* A resync finished or aborted, wake paused devices... */
1516         if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1517             (os.peer_isp && !ns.peer_isp) ||
1518             (os.user_isp && !ns.user_isp))
1519                 resume_next_sg(mdev);
1520
1521         /* sync target done with resync.  Explicitly notify peer, even though
1522          * it should (at least for non-empty resyncs) already know itself. */
1523         if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1524                 drbd_send_state(mdev);
1525
1526         /* free tl_hash if we Got thawed and are C_STANDALONE */
1527         if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1528                 drbd_free_tl_hash(mdev);
1529
1530         /* Upon network connection, we need to start the receiver */
1531         if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1532                 drbd_thread_start(&mdev->receiver);
1533
1534         /* Terminate worker thread if we are unconfigured - it will be
1535            restarted as needed... */
1536         if (ns.disk == D_DISKLESS &&
1537             ns.conn == C_STANDALONE &&
1538             ns.role == R_SECONDARY) {
1539                 if (os.aftr_isp != ns.aftr_isp)
1540                         resume_next_sg(mdev);
1541                 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1542                 if (test_bit(DEVICE_DYING, &mdev->flags))
1543                         drbd_thread_stop_nowait(&mdev->worker);
1544         }
1545
1546         drbd_md_sync(mdev); /* called unconditionally at the end of every after-state-change run */
1547 }
1548
1549
1550 static int drbd_thread_setup(void *arg)
1551 {
1552         struct drbd_thread *thi = (struct drbd_thread *) arg;
1553         struct drbd_conf *mdev = thi->mdev;
1554         unsigned long flags;
1555         int retval;
1556
1557 restart:
1558         retval = thi->function(thi);
1559
1560         spin_lock_irqsave(&thi->t_lock, flags);
1561
1562         /* if the receiver has been "Exiting", the last thing it did
1563          * was set the conn state to "StandAlone",
1564          * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1565          * and receiver thread will be "started".
1566          * drbd_thread_start needs to set "Restarting" in that case.
1567          * t_state check and assignment needs to be within the same spinlock,
1568          * so either thread_start sees Exiting, and can remap to Restarting,
1569          * or thread_start see None, and can proceed as normal.
1570          */
1571
1572         if (thi->t_state == Restarting) {
1573                 dev_info(DEV, "Restarting %s\n", current->comm);
1574                 thi->t_state = Running;
1575                 spin_unlock_irqrestore(&thi->t_lock, flags);
1576                 goto restart;
1577         }
1578
1579         thi->task = NULL;
1580         thi->t_state = None;
1581         smp_mb();
1582         complete(&thi->stop);
1583         spin_unlock_irqrestore(&thi->t_lock, flags);
1584
1585         dev_info(DEV, "Terminating %s\n", current->comm);
1586
1587         /* Release mod reference taken when thread was started */
1588         module_put(THIS_MODULE);
1589         return retval;
1590 }
1591
1592 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1593                       int (*func) (struct drbd_thread *))
1594 {
1595         spin_lock_init(&thi->t_lock);
1596         thi->task    = NULL;
1597         thi->t_state = None;
1598         thi->function = func;
1599         thi->mdev = mdev;
1600 }
1601
1602 int drbd_thread_start(struct drbd_thread *thi)
1603 {
1604         struct drbd_conf *mdev = thi->mdev;
1605         struct task_struct *nt;
1606         unsigned long flags;
1607
1608         const char *me =
1609                 thi == &mdev->receiver ? "receiver" :
1610                 thi == &mdev->asender  ? "asender"  :
1611                 thi == &mdev->worker   ? "worker"   : "NONSENSE";
1612
1613         /* is used from state engine doing drbd_thread_stop_nowait,
1614          * while holding the req lock irqsave */
1615         spin_lock_irqsave(&thi->t_lock, flags);
1616
1617         switch (thi->t_state) {
1618         case None:
1619                 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1620                                 me, current->comm, current->pid);
1621
1622                 /* Get ref on module for thread - this is released when thread exits */
1623                 if (!try_module_get(THIS_MODULE)) {
1624                         dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1625                         spin_unlock_irqrestore(&thi->t_lock, flags);
1626                         return false;
1627                 }
1628
1629                 init_completion(&thi->stop);
1630                 D_ASSERT(thi->task == NULL);
1631                 thi->reset_cpu_mask = 1;
1632                 thi->t_state = Running;
1633                 spin_unlock_irqrestore(&thi->t_lock, flags);
1634                 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1635
1636                 nt = kthread_create(drbd_thread_setup, (void *) thi,
1637                                     "drbd%d_%s", mdev_to_minor(mdev), me);
1638
1639                 if (IS_ERR(nt)) {
1640                         dev_err(DEV, "Couldn't start thread\n");
1641
1642                         module_put(THIS_MODULE);
1643                         return false;
1644                 }
1645                 spin_lock_irqsave(&thi->t_lock, flags);
1646                 thi->task = nt;
1647                 thi->t_state = Running;
1648                 spin_unlock_irqrestore(&thi->t_lock, flags);
1649                 wake_up_process(nt);
1650                 break;
1651         case Exiting:
1652                 thi->t_state = Restarting;
1653                 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1654                                 me, current->comm, current->pid);
1655                 /* fall through */
1656         case Running:
1657         case Restarting:
1658         default:
1659                 spin_unlock_irqrestore(&thi->t_lock, flags);
1660                 break;
1661         }
1662
1663         return true;
1664 }
1665
1666
1667 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1668 {
1669         unsigned long flags;
1670
1671         enum drbd_thread_state ns = restart ? Restarting : Exiting;
1672
1673         /* may be called from state engine, holding the req lock irqsave */
1674         spin_lock_irqsave(&thi->t_lock, flags);
1675
1676         if (thi->t_state == None) {
1677                 spin_unlock_irqrestore(&thi->t_lock, flags);
1678                 if (restart)
1679                         drbd_thread_start(thi);
1680                 return;
1681         }
1682
1683         if (thi->t_state != ns) {
1684                 if (thi->task == NULL) {
1685                         spin_unlock_irqrestore(&thi->t_lock, flags);
1686                         return;
1687                 }
1688
1689                 thi->t_state = ns;
1690                 smp_mb();
1691                 init_completion(&thi->stop);
1692                 if (thi->task != current)
1693                         force_sig(DRBD_SIGKILL, thi->task);
1694
1695         }
1696
1697         spin_unlock_irqrestore(&thi->t_lock, flags);
1698
1699         if (wait)
1700                 wait_for_completion(&thi->stop);
1701 }
1702
1703 #ifdef CONFIG_SMP
1704 /**
1705  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1706  * @mdev:       DRBD device.
1707  *
1708  * Forces all threads of a device onto the same CPU. This is beneficial for
1709  * DRBD's performance. May be overwritten by user's configuration.
1710  */
1711 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1712 {
1713         int ord, cpu;
1714
1715         /* user override. */
1716         if (cpumask_weight(mdev->cpu_mask))
1717                 return;
1718
1719         ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1720         for_each_online_cpu(cpu) {
1721                 if (ord-- == 0) {
1722                         cpumask_set_cpu(cpu, mdev->cpu_mask);
1723                         return;
1724                 }
1725         }
1726         /* should not be reached */
1727         cpumask_setall(mdev->cpu_mask);
1728 }
1729
1730 /**
1731  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1732  * @mdev:       DRBD device.
1733  *
1734  * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1735  * prematurely.
1736  */
1737 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1738 {
1739         struct task_struct *p = current;
1740         struct drbd_thread *thi =
1741                 p == mdev->asender.task  ? &mdev->asender  :
1742                 p == mdev->receiver.task ? &mdev->receiver :
1743                 p == mdev->worker.task   ? &mdev->worker   :
1744                 NULL;
1745         ERR_IF(thi == NULL)
1746                 return;
1747         if (!thi->reset_cpu_mask)
1748                 return;
1749         thi->reset_cpu_mask = 0;
1750         set_cpus_allowed_ptr(p, mdev->cpu_mask);
1751 }
1752 #endif
1753
1754 /* the appropriate socket mutex must be held already */
1755 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1756                           enum drbd_packets cmd, struct p_header80 *h,
1757                           size_t size, unsigned msg_flags)
1758 {
1759         int sent, ok;
1760
1761         ERR_IF(!h) return false;
1762         ERR_IF(!size) return false;
1763
1764         h->magic   = BE_DRBD_MAGIC;
1765         h->command = cpu_to_be16(cmd);
1766         h->length  = cpu_to_be16(size-sizeof(struct p_header80));
1767
1768         sent = drbd_send(mdev, sock, h, size, msg_flags);
1769
1770         ok = (sent == size);
1771         if (!ok)
1772                 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1773                     cmdname(cmd), (int)size, sent);
1774         return ok;
1775 }
1776
1777 /* don't pass the socket. we may only look at it
1778  * when we hold the appropriate socket mutex.
1779  */
1780 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1781                   enum drbd_packets cmd, struct p_header80 *h, size_t size)
1782 {
1783         int ok = 0;
1784         struct socket *sock;
1785
1786         if (use_data_socket) {
1787                 mutex_lock(&mdev->data.mutex);
1788                 sock = mdev->data.socket;
1789         } else {
1790                 mutex_lock(&mdev->meta.mutex);
1791                 sock = mdev->meta.socket;
1792         }
1793
1794         /* drbd_disconnect() could have called drbd_free_sock()
1795          * while we were waiting in down()... */
1796         if (likely(sock != NULL))
1797                 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1798
1799         if (use_data_socket)
1800                 mutex_unlock(&mdev->data.mutex);
1801         else
1802                 mutex_unlock(&mdev->meta.mutex);
1803         return ok;
1804 }
1805
1806 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1807                    size_t size)
1808 {
1809         struct p_header80 h;
1810         int ok;
1811
1812         h.magic   = BE_DRBD_MAGIC;
1813         h.command = cpu_to_be16(cmd);
1814         h.length  = cpu_to_be16(size);
1815
1816         if (!drbd_get_data_sock(mdev))
1817                 return 0;
1818
1819         ok = (sizeof(h) ==
1820                 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1821         ok = ok && (size ==
1822                 drbd_send(mdev, mdev->data.socket, data, size, 0));
1823
1824         drbd_put_data_sock(mdev);
1825
1826         return ok;
1827 }
1828
1829 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1830 {
1831         struct p_rs_param_95 *p;
1832         struct socket *sock;
1833         int size, rv;
1834         const int apv = mdev->agreed_pro_version;
1835
1836         size = apv <= 87 ? sizeof(struct p_rs_param)
1837                 : apv == 88 ? sizeof(struct p_rs_param)
1838                         + strlen(mdev->sync_conf.verify_alg) + 1
1839                 : apv <= 94 ? sizeof(struct p_rs_param_89)
1840                 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1841
1842         /* used from admin command context and receiver/worker context.
1843          * to avoid kmalloc, grab the socket right here,
1844          * then use the pre-allocated sbuf there */
1845         mutex_lock(&mdev->data.mutex);
1846         sock = mdev->data.socket;
1847
1848         if (likely(sock != NULL)) {
1849                 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1850
1851                 p = &mdev->data.sbuf.rs_param_95;
1852
1853                 /* initialize verify_alg and csums_alg */
1854                 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1855
1856                 p->rate = cpu_to_be32(sc->rate);
1857                 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1858                 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1859                 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1860                 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1861
1862                 if (apv >= 88)
1863                         strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1864                 if (apv >= 89)
1865                         strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1866
1867                 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1868         } else
1869                 rv = 0; /* not ok */
1870
1871         mutex_unlock(&mdev->data.mutex);
1872
1873         return rv;
1874 }
1875
1876 int drbd_send_protocol(struct drbd_conf *mdev)
1877 {
1878         struct p_protocol *p;
1879         int size, cf, rv;
1880
1881         size = sizeof(struct p_protocol);
1882
1883         if (mdev->agreed_pro_version >= 87)
1884                 size += strlen(mdev->net_conf->integrity_alg) + 1;
1885
1886         /* we must not recurse into our own queue,
1887          * as that is blocked during handshake */
1888         p = kmalloc(size, GFP_NOIO);
1889         if (p == NULL)
1890                 return 0;
1891
1892         p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1893         p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1894         p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1895         p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1896         p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1897
1898         cf = 0;
1899         if (mdev->net_conf->want_lose)
1900                 cf |= CF_WANT_LOSE;
1901         if (mdev->net_conf->dry_run) {
1902                 if (mdev->agreed_pro_version >= 92)
1903                         cf |= CF_DRY_RUN;
1904                 else {
1905                         dev_err(DEV, "--dry-run is not supported by peer");
1906                         kfree(p);
1907                         return 0;
1908                 }
1909         }
1910         p->conn_flags    = cpu_to_be32(cf);
1911
1912         if (mdev->agreed_pro_version >= 87)
1913                 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1914
1915         rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1916                            (struct p_header80 *)p, size);
1917         kfree(p);
1918         return rv;
1919 }
1920
1921 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1922 {
1923         struct p_uuids p;
1924         int i;
1925
1926         if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1927                 return 1;
1928
1929         for (i = UI_CURRENT; i < UI_SIZE; i++)
1930                 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1931
1932         mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1933         p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1934         uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1935         uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1936         uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1937         p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1938
1939         put_ldev(mdev);
1940
1941         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1942                              (struct p_header80 *)&p, sizeof(p));
1943 }
1944
1945 int drbd_send_uuids(struct drbd_conf *mdev)
1946 {
1947         return _drbd_send_uuids(mdev, 0);
1948 }
1949
1950 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1951 {
1952         return _drbd_send_uuids(mdev, 8);
1953 }
1954
1955
1956 int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1957 {
1958         struct p_rs_uuid p;
1959
1960         p.uuid = cpu_to_be64(val);
1961
1962         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1963                              (struct p_header80 *)&p, sizeof(p));
1964 }
1965
1966 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1967 {
1968         struct p_sizes p;
1969         sector_t d_size, u_size;
1970         int q_order_type;
1971         int ok;
1972
1973         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1974                 D_ASSERT(mdev->ldev->backing_bdev);
1975                 d_size = drbd_get_max_capacity(mdev->ldev);
1976                 u_size = mdev->ldev->dc.disk_size;
1977                 q_order_type = drbd_queue_order_type(mdev);
1978                 put_ldev(mdev);
1979         } else {
1980                 d_size = 0;
1981                 u_size = 0;
1982                 q_order_type = QUEUE_ORDERED_NONE;
1983         }
1984
1985         p.d_size = cpu_to_be64(d_size);
1986         p.u_size = cpu_to_be64(u_size);
1987         p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1988         p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
1989         p.queue_order_type = cpu_to_be16(q_order_type);
1990         p.dds_flags = cpu_to_be16(flags);
1991
1992         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1993                            (struct p_header80 *)&p, sizeof(p));
1994         return ok;
1995 }
1996
1997 /**
1998  * drbd_send_state() - Sends the drbd state to the peer
1999  * @mdev:       DRBD device.
2000  */
2001 int drbd_send_state(struct drbd_conf *mdev)
2002 {
2003         struct socket *sock;
2004         struct p_state p;
2005         int ok = 0;
2006
2007         /* Grab state lock so we wont send state if we're in the middle
2008          * of a cluster wide state change on another thread */
2009         drbd_state_lock(mdev);
2010
2011         mutex_lock(&mdev->data.mutex);
2012
2013         p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2014         sock = mdev->data.socket;
2015
2016         if (likely(sock != NULL)) {
2017                 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2018                                     (struct p_header80 *)&p, sizeof(p), 0);
2019         }
2020
2021         mutex_unlock(&mdev->data.mutex);
2022
2023         drbd_state_unlock(mdev);
2024         return ok;
2025 }
2026
2027 int drbd_send_state_req(struct drbd_conf *mdev,
2028         union drbd_state mask, union drbd_state val)
2029 {
2030         struct p_req_state p;
2031
2032         p.mask    = cpu_to_be32(mask.i);
2033         p.val     = cpu_to_be32(val.i);
2034
2035         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2036                              (struct p_header80 *)&p, sizeof(p));
2037 }
2038
2039 int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2040 {
2041         struct p_req_state_reply p;
2042
2043         p.retcode    = cpu_to_be32(retcode);
2044
2045         return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2046                              (struct p_header80 *)&p, sizeof(p));
2047 }
2048
2049 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2050         struct p_compressed_bm *p,
2051         struct bm_xfer_ctx *c)
2052 {
2053         struct bitstream bs;
2054         unsigned long plain_bits;
2055         unsigned long tmp;
2056         unsigned long rl;
2057         unsigned len;
2058         unsigned toggle;
2059         int bits;
2060
2061         /* may we use this feature? */
2062         if ((mdev->sync_conf.use_rle == 0) ||
2063                 (mdev->agreed_pro_version < 90))
2064                         return 0;
2065
2066         if (c->bit_offset >= c->bm_bits)
2067                 return 0; /* nothing to do. */
2068
2069         /* use at most thus many bytes */
2070         bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2071         memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2072         /* plain bits covered in this code string */
2073         plain_bits = 0;
2074
2075         /* p->encoding & 0x80 stores whether the first run length is set.
2076          * bit offset is implicit.
2077          * start with toggle == 2 to be able to tell the first iteration */
2078         toggle = 2;
2079
2080         /* see how much plain bits we can stuff into one packet
2081          * using RLE and VLI. */
2082         do {
2083                 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2084                                     : _drbd_bm_find_next(mdev, c->bit_offset);
2085                 if (tmp == -1UL)
2086                         tmp = c->bm_bits;
2087                 rl = tmp - c->bit_offset;
2088
2089                 if (toggle == 2) { /* first iteration */
2090                         if (rl == 0) {
2091                                 /* the first checked bit was set,
2092                                  * store start value, */
2093                                 DCBP_set_start(p, 1);
2094                                 /* but skip encoding of zero run length */
2095                                 toggle = !toggle;
2096                                 continue;
2097                         }
2098                         DCBP_set_start(p, 0);
2099                 }
2100
2101                 /* paranoia: catch zero runlength.
2102                  * can only happen if bitmap is modified while we scan it. */
2103                 if (rl == 0) {
2104                         dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2105                             "t:%u bo:%lu\n", toggle, c->bit_offset);
2106                         return -1;
2107                 }
2108
2109                 bits = vli_encode_bits(&bs, rl);
2110                 if (bits == -ENOBUFS) /* buffer full */
2111                         break;
2112                 if (bits <= 0) {
2113                         dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2114                         return 0;
2115                 }
2116
2117                 toggle = !toggle;
2118                 plain_bits += rl;
2119                 c->bit_offset = tmp;
2120         } while (c->bit_offset < c->bm_bits);
2121
2122         len = bs.cur.b - p->code + !!bs.cur.bit;
2123
2124         if (plain_bits < (len << 3)) {
2125                 /* incompressible with this method.
2126                  * we need to rewind both word and bit position. */
2127                 c->bit_offset -= plain_bits;
2128                 bm_xfer_ctx_bit_to_word_offset(c);
2129                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2130                 return 0;
2131         }
2132
2133         /* RLE + VLI was able to compress it just fine.
2134          * update c->word_offset. */
2135         bm_xfer_ctx_bit_to_word_offset(c);
2136
2137         /* store pad_bits */
2138         DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2139
2140         return len;
2141 }
2142
2143 enum { OK, FAILED, DONE }
2144 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2145         struct p_header80 *h, struct bm_xfer_ctx *c)
2146 {
2147         struct p_compressed_bm *p = (void*)h;
2148         unsigned long num_words;
2149         int len;
2150         int ok;
2151
2152         len = fill_bitmap_rle_bits(mdev, p, c);
2153
2154         if (len < 0)
2155                 return FAILED;
2156
2157         if (len) {
2158                 DCBP_set_code(p, RLE_VLI_Bits);
2159                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2160                         sizeof(*p) + len, 0);
2161
2162                 c->packets[0]++;
2163                 c->bytes[0] += sizeof(*p) + len;
2164
2165                 if (c->bit_offset >= c->bm_bits)
2166                         len = 0; /* DONE */
2167         } else {
2168                 /* was not compressible.
2169                  * send a buffer full of plain text bits instead. */
2170                 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2171                 len = num_words * sizeof(long);
2172                 if (len)
2173                         drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2174                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2175                                    h, sizeof(struct p_header80) + len, 0);
2176                 c->word_offset += num_words;
2177                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2178
2179                 c->packets[1]++;
2180                 c->bytes[1] += sizeof(struct p_header80) + len;
2181
2182                 if (c->bit_offset > c->bm_bits)
2183                         c->bit_offset = c->bm_bits;
2184         }
2185         ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2186
2187         if (ok == DONE)
2188                 INFO_bm_xfer_stats(mdev, "send", c);
2189         return ok;
2190 }
2191
2192 /* See the comment at receive_bitmap() */
2193 int _drbd_send_bitmap(struct drbd_conf *mdev)
2194 {
2195         struct bm_xfer_ctx c;
2196         struct p_header80 *p;
2197         int ret;
2198
2199         ERR_IF(!mdev->bitmap) return false;
2200
2201         /* maybe we should use some per thread scratch page,
2202          * and allocate that during initial device creation? */
2203         p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2204         if (!p) {
2205                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2206                 return false;
2207         }
2208
2209         if (get_ldev(mdev)) {
2210                 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2211                         dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2212                         drbd_bm_set_all(mdev);
2213                         if (drbd_bm_write(mdev)) {
2214                                 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2215                                  * but otherwise process as per normal - need to tell other
2216                                  * side that a full resync is required! */
2217                                 dev_err(DEV, "Failed to write bitmap to disk!\n");
2218                         } else {
2219                                 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2220                                 drbd_md_sync(mdev);
2221                         }
2222                 }
2223                 put_ldev(mdev);
2224         }
2225
2226         c = (struct bm_xfer_ctx) {
2227                 .bm_bits = drbd_bm_bits(mdev),
2228                 .bm_words = drbd_bm_words(mdev),
2229         };
2230
2231         do {
2232                 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2233         } while (ret == OK);
2234
2235         free_page((unsigned long) p);
2236         return (ret == DONE);
2237 }
2238
/*
 * Send the bitmap while holding the data socket.
 * Returns -1 if the data socket could not be obtained, 0 if
 * _drbd_send_bitmap() succeeded and 1 if it failed.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
        int failed;

        if (!drbd_get_data_sock(mdev))
                return -1;

        failed = !_drbd_send_bitmap(mdev);
        drbd_put_data_sock(mdev);

        return failed;
}
2249
2250 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2251 {
2252         int ok;
2253         struct p_barrier_ack p;
2254
2255         p.barrier  = barrier_nr;
2256         p.set_size = cpu_to_be32(set_size);
2257
2258         if (mdev->state.conn < C_CONNECTED)
2259                 return false;
2260         ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2261                         (struct p_header80 *)&p, sizeof(p));
2262         return ok;
2263 }
2264
2265 /**
2266  * _drbd_send_ack() - Sends an ack packet
2267  * @mdev:       DRBD device.
2268  * @cmd:        Packet command code.
2269  * @sector:     sector, needs to be in big endian byte order
2270  * @blksize:    size in byte, needs to be in big endian byte order
2271  * @block_id:   Id, big endian byte order
2272  */
2273 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2274                           u64 sector,
2275                           u32 blksize,
2276                           u64 block_id)
2277 {
2278         int ok;
2279         struct p_block_ack p;
2280
2281         p.sector   = sector;
2282         p.block_id = block_id;
2283         p.blksize  = blksize;
2284         p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2285
2286         if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2287                 return false;
2288         ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2289                                 (struct p_header80 *)&p, sizeof(p));
2290         return ok;
2291 }
2292
2293 /* dp->sector and dp->block_id already/still in network byte order,
2294  * data_size is payload size according to dp->head,
2295  * and may need to be corrected for digest size. */
2296 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2297                      struct p_data *dp, int data_size)
2298 {
2299         data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2300                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2301         return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2302                               dp->block_id);
2303 }
2304
2305 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2306                      struct p_block_req *rp)
2307 {
2308         return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2309 }
2310
2311 /**
2312  * drbd_send_ack() - Sends an ack packet
2313  * @mdev:       DRBD device.
2314  * @cmd:        Packet command code.
2315  * @e:          Epoch entry.
2316  */
2317 int drbd_send_ack(struct drbd_conf *mdev,
2318         enum drbd_packets cmd, struct drbd_epoch_entry *e)
2319 {
2320         return _drbd_send_ack(mdev, cmd,
2321                               cpu_to_be64(e->sector),
2322                               cpu_to_be32(e->size),
2323                               e->block_id);
2324 }
2325
2326 /* This function misuses the block_id field to signal if the blocks
2327  * are is sync or not. */
2328 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2329                      sector_t sector, int blksize, u64 block_id)
2330 {
2331         return _drbd_send_ack(mdev, cmd,
2332                               cpu_to_be64(sector),
2333                               cpu_to_be32(blksize),
2334                               cpu_to_be64(block_id));
2335 }
2336
2337 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2338                        sector_t sector, int size, u64 block_id)
2339 {
2340         int ok;
2341         struct p_block_req p;
2342
2343         p.sector   = cpu_to_be64(sector);
2344         p.block_id = block_id;
2345         p.blksize  = cpu_to_be32(size);
2346
2347         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2348                                 (struct p_header80 *)&p, sizeof(p));
2349         return ok;
2350 }
2351
2352 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2353                             sector_t sector, int size,
2354                             void *digest, int digest_size,
2355                             enum drbd_packets cmd)
2356 {
2357         int ok;
2358         struct p_block_req p;
2359
2360         p.sector   = cpu_to_be64(sector);
2361         p.block_id = BE_DRBD_MAGIC + 0xbeef;
2362         p.blksize  = cpu_to_be32(size);
2363
2364         p.head.magic   = BE_DRBD_MAGIC;
2365         p.head.command = cpu_to_be16(cmd);
2366         p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2367
2368         mutex_lock(&mdev->data.mutex);
2369
2370         ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2371         ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2372
2373         mutex_unlock(&mdev->data.mutex);
2374
2375         return ok;
2376 }
2377
2378 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2379 {
2380         int ok;
2381         struct p_block_req p;
2382
2383         p.sector   = cpu_to_be64(sector);
2384         p.block_id = BE_DRBD_MAGIC + 0xbabe;
2385         p.blksize  = cpu_to_be32(size);
2386
2387         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2388                            (struct p_header80 *)&p, sizeof(p));
2389         return ok;
2390 }
2391
2392 /* called on sndtimeo
2393  * returns false if we should retry,
2394  * true if we think connection is dead
2395  */
2396 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2397 {
2398         int drop_it;
2399         /* long elapsed = (long)(jiffies - mdev->last_received); */
2400
2401         drop_it =   mdev->meta.socket == sock
2402                 || !mdev->asender.task
2403                 || get_t_state(&mdev->asender) != Running
2404                 || mdev->state.conn < C_CONNECTED;
2405
2406         if (drop_it)
2407                 return true;
2408
2409         drop_it = !--mdev->ko_count;
2410         if (!drop_it) {
2411                 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2412                        current->comm, current->pid, mdev->ko_count);
2413                 request_ping(mdev);
2414         }
2415
2416         return drop_it; /* && (mdev->state == R_PRIMARY) */;
2417 }
2418
2419 /* The idea of sendpage seems to be to put some kind of reference
2420  * to the page into the skb, and to hand it over to the NIC. In
2421  * this process get_page() gets called.
2422  *
2423  * As soon as the page was really sent over the network put_page()
2424  * gets called by some part of the network layer. [ NIC driver? ]
2425  *
2426  * [ get_page() / put_page() increment/decrement the count. If count
2427  *   reaches 0 the page will be freed. ]
2428  *
2429  * This works nicely with pages from FSs.
2430  * But this means that in protocol A we might signal IO completion too early!
2431  *
2432  * In order not to corrupt data during a resync we must make sure
2433  * that we do not reuse our own buffer pages (EEs) to early, therefore
2434  * we have the net_ee list.
2435  *
2436  * XFS seems to have problems, still, it submits pages with page_count == 0!
2437  * As a workaround, we disable sendpage on pages
2438  * with page_count == 0 or PageSlab.
2439  */
2440 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2441                    int offset, size_t size, unsigned msg_flags)
2442 {
2443         int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2444         kunmap(page);
2445         if (sent == size)
2446                 mdev->send_cnt += size>>9;
2447         return sent == size;
2448 }
2449
2450 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2451                     int offset, size_t size, unsigned msg_flags)
2452 {
2453         mm_segment_t oldfs = get_fs();
2454         int sent, ok;
2455         int len = size;
2456
2457         /* e.g. XFS meta- & log-data is in slab pages, which have a
2458          * page_count of 0 and/or have PageSlab() set.
2459          * we cannot use send_page for those, as that does get_page();
2460          * put_page(); and would cause either a VM_BUG directly, or
2461          * __page_cache_release a page that would actually still be referenced
2462          * by someone, leading to some obscure delayed Oops somewhere else. */
2463         if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2464                 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2465
2466         msg_flags |= MSG_NOSIGNAL;
2467         drbd_update_congested(mdev);
2468         set_fs(KERNEL_DS);
2469         do {
2470                 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2471                                                         offset, len,
2472                                                         msg_flags);
2473                 if (sent == -EAGAIN) {
2474                         if (we_should_drop_the_connection(mdev,
2475                                                           mdev->data.socket))
2476                                 break;
2477                         else
2478                                 continue;
2479                 }
2480                 if (sent <= 0) {
2481                         dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2482                              __func__, (int)size, len, sent);
2483                         break;
2484                 }
2485                 len    -= sent;
2486                 offset += sent;
2487         } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2488         set_fs(oldfs);
2489         clear_bit(NET_CONGESTED, &mdev->flags);
2490
2491         ok = (len == 0);
2492         if (likely(ok))
2493                 mdev->send_cnt += size>>9;
2494         return ok;
2495 }
2496
2497 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2498 {
2499         struct bio_vec *bvec;
2500         int i;
2501         /* hint all but last page with MSG_MORE */
2502         __bio_for_each_segment(bvec, bio, i, 0) {
2503                 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2504                                      bvec->bv_offset, bvec->bv_len,
2505                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2506                         return 0;
2507         }
2508         return 1;
2509 }
2510
2511 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2512 {
2513         struct bio_vec *bvec;
2514         int i;
2515         /* hint all but last page with MSG_MORE */
2516         __bio_for_each_segment(bvec, bio, i, 0) {
2517                 if (!_drbd_send_page(mdev, bvec->bv_page,
2518                                      bvec->bv_offset, bvec->bv_len,
2519                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2520                         return 0;
2521         }
2522         return 1;
2523 }
2524
2525 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2526 {
2527         struct page *page = e->pages;
2528         unsigned len = e->size;
2529         /* hint all but last page with MSG_MORE */
2530         page_chain_for_each(page) {
2531                 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2532                 if (!_drbd_send_page(mdev, page, 0, l,
2533                                 page_chain_next(page) ? MSG_MORE : 0))
2534                         return 0;
2535                 len -= l;
2536         }
2537         return 1;
2538 }
2539
2540 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2541 {
2542         if (mdev->agreed_pro_version >= 95)
2543                 return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2544                         (bi_rw & REQ_FUA ? DP_FUA : 0) |
2545                         (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2546                         (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2547         else
2548                 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2549 }
2550
2551 /* Used to send write requests
2552  * R_PRIMARY -> Peer    (P_DATA)
2553  */
2554 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2555 {
2556         int ok = 1;
2557         struct p_data p;
2558         unsigned int dp_flags = 0;
2559         void *dgb;
2560         int dgs;
2561
2562         if (!drbd_get_data_sock(mdev))
2563                 return 0;
2564
2565         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2566                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2567
2568         if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2569                 p.head.h80.magic   = BE_DRBD_MAGIC;
2570                 p.head.h80.command = cpu_to_be16(P_DATA);
2571                 p.head.h80.length  =
2572                         cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2573         } else {
2574                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2575                 p.head.h95.command = cpu_to_be16(P_DATA);
2576                 p.head.h95.length  =
2577                         cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2578         }
2579
2580         p.sector   = cpu_to_be64(req->sector);
2581         p.block_id = (unsigned long)req;
2582         p.seq_num  = cpu_to_be32(req->seq_num =
2583                                  atomic_add_return(1, &mdev->packet_seq));
2584
2585         dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2586
2587         if (mdev->state.conn >= C_SYNC_SOURCE &&
2588             mdev->state.conn <= C_PAUSED_SYNC_T)
2589                 dp_flags |= DP_MAY_SET_IN_SYNC;
2590
2591         p.dp_flags = cpu_to_be32(dp_flags);
2592         set_bit(UNPLUG_REMOTE, &mdev->flags);
2593         ok = (sizeof(p) ==
2594                 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2595         if (ok && dgs) {
2596                 dgb = mdev->int_dig_out;
2597                 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2598                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2599         }
2600         if (ok) {
2601                 /* For protocol A, we have to memcpy the payload into
2602                  * socket buffers, as we may complete right away
2603                  * as soon as we handed it over to tcp, at which point the data
2604                  * pages may become invalid.
2605                  *
2606                  * For data-integrity enabled, we copy it as well, so we can be
2607                  * sure that even if the bio pages may still be modified, it
2608                  * won't change the data on the wire, thus if the digest checks
2609                  * out ok after sending on this side, but does not fit on the
2610                  * receiving side, we sure have detected corruption elsewhere.
2611                  */
2612                 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2613                         ok = _drbd_send_bio(mdev, req->master_bio);
2614                 else
2615                         ok = _drbd_send_zc_bio(mdev, req->master_bio);
2616
2617                 /* double check digest, sometimes buffers have been modified in flight. */
2618                 if (dgs > 0 && dgs <= 64) {
2619                         /* 64 byte, 512 bit, is the larges digest size
2620                          * currently supported in kernel crypto. */
2621                         unsigned char digest[64];
2622                         drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2623                         if (memcmp(mdev->int_dig_out, digest, dgs)) {
2624                                 dev_warn(DEV,
2625                                         "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2626                                         (unsigned long long)req->sector, req->size);
2627                         }
2628                 } /* else if (dgs > 64) {
2629                      ... Be noisy about digest too large ...
2630                 } */
2631         }
2632
2633         drbd_put_data_sock(mdev);
2634
2635         return ok;
2636 }
2637
2638 /* answer packet, used to send data back for read requests:
2639  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2640  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2641  */
2642 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2643                     struct drbd_epoch_entry *e)
2644 {
2645         int ok;
2646         struct p_data p;
2647         void *dgb;
2648         int dgs;
2649
2650         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2651                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2652
2653         if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2654                 p.head.h80.magic   = BE_DRBD_MAGIC;
2655                 p.head.h80.command = cpu_to_be16(cmd);
2656                 p.head.h80.length  =
2657                         cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2658         } else {
2659                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2660                 p.head.h95.command = cpu_to_be16(cmd);
2661                 p.head.h95.length  =
2662                         cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2663         }
2664
2665         p.sector   = cpu_to_be64(e->sector);
2666         p.block_id = e->block_id;
2667         /* p.seq_num  = 0;    No sequence numbers here.. */
2668
2669         /* Only called by our kernel thread.
2670          * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2671          * in response to admin command or module unload.
2672          */
2673         if (!drbd_get_data_sock(mdev))
2674                 return 0;
2675
2676         ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2677         if (ok && dgs) {
2678                 dgb = mdev->int_dig_out;
2679                 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2680                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2681         }
2682         if (ok)
2683                 ok = _drbd_send_zc_ee(mdev, e);
2684
2685         drbd_put_data_sock(mdev);
2686
2687         return ok;
2688 }
2689
2690 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2691 {
2692         struct p_block_desc p;
2693
2694         p.sector  = cpu_to_be64(req->sector);
2695         p.blksize = cpu_to_be32(req->size);
2696
2697         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2698 }
2699
2700 /*
2701   drbd_send distinguishes two cases:
2702
2703   Packets sent via the data socket "sock"
2704   and packets sent via the meta data socket "msock"
2705
2706                     sock                      msock
2707   -----------------+-------------------------+------------------------------
2708   timeout           conf.timeout / 2          conf.timeout / 2
2709   timeout action    send a ping via msock     Abort communication
2710                                               and close all sockets
2711 */
2712
2713 /*
2714  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2715  */
2716 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2717               void *buf, size_t size, unsigned msg_flags)
2718 {
2719         struct kvec iov;
2720         struct msghdr msg;
2721         int rv, sent = 0;
2722
2723         if (!sock)
2724                 return -1000;
2725
2726         /* THINK  if (signal_pending) return ... ? */
2727
2728         iov.iov_base = buf;
2729         iov.iov_len  = size;
2730
2731         msg.msg_name       = NULL;
2732         msg.msg_namelen    = 0;
2733         msg.msg_control    = NULL;
2734         msg.msg_controllen = 0;
2735         msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2736
2737         if (sock == mdev->data.socket) {
2738                 mdev->ko_count = mdev->net_conf->ko_count;
2739                 drbd_update_congested(mdev);
2740         }
2741         do {
2742                 /* STRANGE
2743                  * tcp_sendmsg does _not_ use its size parameter at all ?
2744                  *
2745                  * -EAGAIN on timeout, -EINTR on signal.
2746                  */
2747 /* THINK
2748  * do we need to block DRBD_SIG if sock == &meta.socket ??
2749  * otherwise wake_asender() might interrupt some send_*Ack !
2750  */
2751                 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2752                 if (rv == -EAGAIN) {
2753                         if (we_should_drop_the_connection(mdev, sock))
2754                                 break;
2755                         else
2756                                 continue;
2757                 }
2758                 D_ASSERT(rv != 0);
2759                 if (rv == -EINTR) {
2760                         flush_signals(current);
2761                         rv = 0;
2762                 }
2763                 if (rv < 0)
2764                         break;
2765                 sent += rv;
2766                 iov.iov_base += rv;
2767                 iov.iov_len  -= rv;
2768         } while (sent < size);
2769
2770         if (sock == mdev->data.socket)
2771                 clear_bit(NET_CONGESTED, &mdev->flags);
2772
2773         if (rv <= 0) {
2774                 if (rv != -EAGAIN) {
2775                         dev_err(DEV, "%s_sendmsg returned %d\n",
2776                             sock == mdev->meta.socket ? "msock" : "sock",
2777                             rv);
2778                         drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2779                 } else
2780                         drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2781         }
2782
2783         return sent;
2784 }
2785
2786 static int drbd_open(struct block_device *bdev, fmode_t mode)
2787 {
2788         struct drbd_conf *mdev = bdev->bd_disk->private_data;
2789         unsigned long flags;
2790         int rv = 0;
2791
2792         mutex_lock(&drbd_main_mutex);
2793         spin_lock_irqsave(&mdev->req_lock, flags);
2794         /* to have a stable mdev->state.role
2795          * and no race with updating open_cnt */
2796
2797         if (mdev->state.role != R_PRIMARY) {
2798                 if (mode & FMODE_WRITE)
2799                         rv = -EROFS;
2800                 else if (!allow_oos)
2801                         rv = -EMEDIUMTYPE;
2802         }
2803
2804         if (!rv)
2805                 mdev->open_cnt++;
2806         spin_unlock_irqrestore(&mdev->req_lock, flags);
2807         mutex_unlock(&drbd_main_mutex);
2808
2809         return rv;
2810 }
2811
2812 static int drbd_release(struct gendisk *gd, fmode_t mode)
2813 {
2814         struct drbd_conf *mdev = gd->private_data;
2815         mutex_lock(&drbd_main_mutex);
2816         mdev->open_cnt--;
2817         mutex_unlock(&drbd_main_mutex);
2818         return 0;
2819 }
2820
2821 static void drbd_set_defaults(struct drbd_conf *mdev)
2822 {
2823         /* This way we get a compile error when sync_conf grows,
2824            and we forgot to initialize it here */
2825         mdev->sync_conf = (struct syncer_conf) {
2826                 /* .rate = */           DRBD_RATE_DEF,
2827                 /* .after = */          DRBD_AFTER_DEF,
2828                 /* .al_extents = */     DRBD_AL_EXTENTS_DEF,
2829                 /* .verify_alg = */     {}, 0,
2830                 /* .cpu_mask = */       {}, 0,
2831                 /* .csums_alg = */      {}, 0,
2832                 /* .use_rle = */        0,
2833                 /* .on_no_data = */     DRBD_ON_NO_DATA_DEF,
2834                 /* .c_plan_ahead = */   DRBD_C_PLAN_AHEAD_DEF,
2835                 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2836                 /* .c_fill_target = */  DRBD_C_FILL_TARGET_DEF,
2837                 /* .c_max_rate = */     DRBD_C_MAX_RATE_DEF,
2838                 /* .c_min_rate = */     DRBD_C_MIN_RATE_DEF
2839         };
2840
2841         /* Have to use that way, because the layout differs between
2842            big endian and little endian */
2843         mdev->state = (union drbd_state) {
2844                 { .role = R_SECONDARY,
2845                   .peer = R_UNKNOWN,
2846                   .conn = C_STANDALONE,
2847                   .disk = D_DISKLESS,
2848                   .pdsk = D_UNKNOWN,
2849                   .susp = 0,
2850                   .susp_nod = 0,
2851                   .susp_fen = 0
2852                 } };
2853 }
2854
2855 void drbd_init_set_defaults(struct drbd_conf *mdev)
2856 {
2857         /* the memset(,0,) did most of this.
2858          * note: only assignments, no allocation in here */
2859
2860         drbd_set_defaults(mdev);
2861
2862         atomic_set(&mdev->ap_bio_cnt, 0);
2863         atomic_set(&mdev->ap_pending_cnt, 0);
2864         atomic_set(&mdev->rs_pending_cnt, 0);
2865         atomic_set(&mdev->unacked_cnt, 0);
2866         atomic_set(&mdev->local_cnt, 0);
2867         atomic_set(&mdev->net_cnt, 0);
2868         atomic_set(&mdev->packet_seq, 0);
2869         atomic_set(&mdev->pp_in_use, 0);
2870         atomic_set(&mdev->pp_in_use_by_net, 0);
2871         atomic_set(&mdev->rs_sect_in, 0);
2872         atomic_set(&mdev->rs_sect_ev, 0);
2873         atomic_set(&mdev->ap_in_flight, 0);
2874
2875         mutex_init(&mdev->md_io_mutex);
2876         mutex_init(&mdev->data.mutex);
2877         mutex_init(&mdev->meta.mutex);
2878         sema_init(&mdev->data.work.s, 0);
2879         sema_init(&mdev->meta.work.s, 0);
2880         mutex_init(&mdev->state_mutex);
2881
2882         spin_lock_init(&mdev->data.work.q_lock);
2883         spin_lock_init(&mdev->meta.work.q_lock);
2884
2885         spin_lock_init(&mdev->al_lock);
2886         spin_lock_init(&mdev->req_lock);
2887         spin_lock_init(&mdev->peer_seq_lock);
2888         spin_lock_init(&mdev->epoch_lock);
2889
2890         INIT_LIST_HEAD(&mdev->active_ee);
2891         INIT_LIST_HEAD(&mdev->sync_ee);
2892         INIT_LIST_HEAD(&mdev->done_ee);
2893         INIT_LIST_HEAD(&mdev->read_ee);
2894         INIT_LIST_HEAD(&mdev->net_ee);
2895         INIT_LIST_HEAD(&mdev->resync_reads);
2896         INIT_LIST_HEAD(&mdev->data.work.q);
2897         INIT_LIST_HEAD(&mdev->meta.work.q);
2898         INIT_LIST_HEAD(&mdev->resync_work.list);
2899         INIT_LIST_HEAD(&mdev->unplug_work.list);
2900         INIT_LIST_HEAD(&mdev->go_diskless.list);
2901         INIT_LIST_HEAD(&mdev->md_sync_work.list);
2902         INIT_LIST_HEAD(&mdev->start_resync_work.list);
2903         INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2904
2905         mdev->resync_work.cb  = w_resync_inactive;
2906         mdev->unplug_work.cb  = w_send_write_hint;
2907         mdev->go_diskless.cb  = w_go_diskless;
2908         mdev->md_sync_work.cb = w_md_sync;
2909         mdev->bm_io_work.w.cb = w_bitmap_io;
2910         init_timer(&mdev->resync_timer);
2911         init_timer(&mdev->md_sync_timer);
2912         mdev->resync_timer.function = resync_timer_fn;
2913         mdev->resync_timer.data = (unsigned long) mdev;
2914         mdev->md_sync_timer.function = md_sync_timer_fn;
2915         mdev->md_sync_timer.data = (unsigned long) mdev;
2916
2917         init_waitqueue_head(&mdev->misc_wait);
2918         init_waitqueue_head(&mdev->state_wait);
2919         init_waitqueue_head(&mdev->net_cnt_wait);
2920         init_waitqueue_head(&mdev->ee_wait);
2921         init_waitqueue_head(&mdev->al_wait);
2922         init_waitqueue_head(&mdev->seq_wait);
2923
2924         drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2925         drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2926         drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2927
2928         mdev->agreed_pro_version = PRO_VERSION_MAX;
2929         mdev->write_ordering = WO_bdev_flush;
2930         mdev->resync_wenr = LC_FREE;
2931 }
2932
2933 void drbd_mdev_cleanup(struct drbd_conf *mdev)
2934 {
2935         int i;
2936         if (mdev->receiver.t_state != None)
2937                 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2938                                 mdev->receiver.t_state);
2939
2940         /* no need to lock it, I'm the only thread alive */
2941         if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
2942                 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2943         mdev->al_writ_cnt  =
2944         mdev->bm_writ_cnt  =
2945         mdev->read_cnt     =
2946         mdev->recv_cnt     =
2947         mdev->send_cnt     =
2948         mdev->writ_cnt     =
2949         mdev->p_size       =
2950         mdev->rs_start     =
2951         mdev->rs_total     =
2952         mdev->rs_failed    = 0;
2953         mdev->rs_last_events = 0;
2954         mdev->rs_last_sect_ev = 0;
2955         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2956                 mdev->rs_mark_left[i] = 0;
2957                 mdev->rs_mark_time[i] = 0;
2958         }
2959         D_ASSERT(mdev->net_conf == NULL);
2960
2961         drbd_set_my_capacity(mdev, 0);
2962         if (mdev->bitmap) {
2963                 /* maybe never allocated. */
2964                 drbd_bm_resize(mdev, 0, 1);
2965                 drbd_bm_cleanup(mdev);
2966         }
2967
2968         drbd_free_resources(mdev);
2969         clear_bit(AL_SUSPENDED, &mdev->flags);
2970
2971         /*
2972          * currently we drbd_init_ee only on module load, so
2973          * we may do drbd_release_ee only on module unload!
2974          */
2975         D_ASSERT(list_empty(&mdev->active_ee));
2976         D_ASSERT(list_empty(&mdev->sync_ee));
2977         D_ASSERT(list_empty(&mdev->done_ee));
2978         D_ASSERT(list_empty(&mdev->read_ee));
2979         D_ASSERT(list_empty(&mdev->net_ee));
2980         D_ASSERT(list_empty(&mdev->resync_reads));
2981         D_ASSERT(list_empty(&mdev->data.work.q));
2982         D_ASSERT(list_empty(&mdev->meta.work.q));
2983         D_ASSERT(list_empty(&mdev->resync_work.list));
2984         D_ASSERT(list_empty(&mdev->unplug_work.list));
2985         D_ASSERT(list_empty(&mdev->go_diskless.list));
2986 }
2987
2988
2989 static void drbd_destroy_mempools(void)
2990 {
2991         struct page *page;
2992
2993         while (drbd_pp_pool) {
2994                 page = drbd_pp_pool;
2995                 drbd_pp_pool = (struct page *)page_private(page);
2996                 __free_page(page);
2997                 drbd_pp_vacant--;
2998         }
2999
3000         /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3001
3002         if (drbd_ee_mempool)
3003                 mempool_destroy(drbd_ee_mempool);
3004         if (drbd_request_mempool)
3005                 mempool_destroy(drbd_request_mempool);
3006         if (drbd_ee_cache)
3007                 kmem_cache_destroy(drbd_ee_cache);
3008         if (drbd_request_cache)
3009                 kmem_cache_destroy(drbd_request_cache);
3010         if (drbd_bm_ext_cache)
3011                 kmem_cache_destroy(drbd_bm_ext_cache);
3012         if (drbd_al_ext_cache)
3013                 kmem_cache_destroy(drbd_al_ext_cache);
3014
3015         drbd_ee_mempool      = NULL;
3016         drbd_request_mempool = NULL;
3017         drbd_ee_cache        = NULL;
3018         drbd_request_cache   = NULL;
3019         drbd_bm_ext_cache    = NULL;
3020         drbd_al_ext_cache    = NULL;
3021
3022         return;
3023 }
3024
3025 static int drbd_create_mempools(void)
3026 {
3027         struct page *page;
3028         const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3029         int i;
3030
3031         /* prepare our caches and mempools */
3032         drbd_request_mempool = NULL;
3033         drbd_ee_cache        = NULL;
3034         drbd_request_cache   = NULL;
3035         drbd_bm_ext_cache    = NULL;
3036         drbd_al_ext_cache    = NULL;
3037         drbd_pp_pool         = NULL;
3038
3039         /* caches */
3040         drbd_request_cache = kmem_cache_create(
3041                 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3042         if (drbd_request_cache == NULL)
3043                 goto Enomem;
3044
3045         drbd_ee_cache = kmem_cache_create(
3046                 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3047         if (drbd_ee_cache == NULL)
3048                 goto Enomem;
3049
3050         drbd_bm_ext_cache = kmem_cache_create(
3051                 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3052         if (drbd_bm_ext_cache == NULL)
3053                 goto Enomem;
3054
3055         drbd_al_ext_cache = kmem_cache_create(
3056                 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3057         if (drbd_al_ext_cache == NULL)
3058                 goto Enomem;
3059
3060         /* mempools */
3061         drbd_request_mempool = mempool_create(number,
3062                 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3063         if (drbd_request_mempool == NULL)
3064                 goto Enomem;
3065
3066         drbd_ee_mempool = mempool_create(number,
3067                 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3068         if (drbd_ee_mempool == NULL)
3069                 goto Enomem;
3070
3071         /* drbd's page pool */
3072         spin_lock_init(&drbd_pp_lock);
3073
3074         for (i = 0; i < number; i++) {
3075                 page = alloc_page(GFP_HIGHUSER);
3076                 if (!page)
3077                         goto Enomem;
3078                 set_page_private(page, (unsigned long)drbd_pp_pool);
3079                 drbd_pp_pool = page;
3080         }
3081         drbd_pp_vacant = number;
3082
3083         return 0;
3084
3085 Enomem:
3086         drbd_destroy_mempools(); /* in case we allocated some */
3087         return -ENOMEM;
3088 }
3089
3090 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3091         void *unused)
3092 {
3093         /* just so we have it.  you never know what interesting things we
3094          * might want to do here some day...
3095          */
3096
3097         return NOTIFY_DONE;
3098 }
3099
3100 static struct notifier_block drbd_notifier = {
3101         .notifier_call = drbd_notify_sys,
3102 };
3103
3104 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3105 {
3106         int rr;
3107
3108         rr = drbd_release_ee(mdev, &mdev->active_ee);
3109         if (rr)
3110                 dev_err(DEV, "%d EEs in active list found!\n", rr);
3111
3112         rr = drbd_release_ee(mdev, &mdev->sync_ee);
3113         if (rr)
3114                 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3115
3116         rr = drbd_release_ee(mdev, &mdev->read_ee);
3117         if (rr)
3118                 dev_err(DEV, "%d EEs in read list found!\n", rr);
3119
3120         rr = drbd_release_ee(mdev, &mdev->done_ee);
3121         if (rr)
3122                 dev_err(DEV, "%d EEs in done list found!\n", rr);
3123
3124         rr = drbd_release_ee(mdev, &mdev->net_ee);
3125         if (rr)
3126                 dev_err(DEV, "%d EEs in net list found!\n", rr);
3127 }
3128
3129 /* caution. no locking.
3130  * currently only used from module cleanup code. */
3131 static void drbd_delete_device(unsigned int minor)
3132 {
3133         struct drbd_conf *mdev = minor_to_mdev(minor);
3134
3135         if (!mdev)
3136                 return;
3137
3138         /* paranoia asserts */
3139         if (mdev->open_cnt != 0)
3140                 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3141                                 __FILE__ , __LINE__);
3142
3143         ERR_IF (!list_empty(&mdev->data.work.q)) {
3144                 struct list_head *lp;
3145                 list_for_each(lp, &mdev->data.work.q) {
3146                         dev_err(DEV, "lp = %p\n", lp);
3147                 }
3148         };
3149         /* end paranoia asserts */
3150
3151         del_gendisk(mdev->vdisk);
3152
3153         /* cleanup stuff that may have been allocated during
3154          * device (re-)configuration or state changes */
3155
3156         if (mdev->this_bdev)
3157                 bdput(mdev->this_bdev);
3158
3159         drbd_free_resources(mdev);
3160
3161         drbd_release_ee_lists(mdev);
3162
3163         /* should be free'd on disconnect? */
3164         kfree(mdev->ee_hash);
3165         /*
3166         mdev->ee_hash_s = 0;
3167         mdev->ee_hash = NULL;
3168         */
3169
3170         lc_destroy(mdev->act_log);
3171         lc_destroy(mdev->resync);
3172
3173         kfree(mdev->p_uuid);
3174         /* mdev->p_uuid = NULL; */
3175
3176         kfree(mdev->int_dig_out);
3177         kfree(mdev->int_dig_in);
3178         kfree(mdev->int_dig_vv);
3179
3180         /* cleanup the rest that has been
3181          * allocated from drbd_new_device
3182          * and actually free the mdev itself */
3183         drbd_free_mdev(mdev);
3184 }
3185
3186 static void drbd_cleanup(void)
3187 {
3188         unsigned int i;
3189
3190         unregister_reboot_notifier(&drbd_notifier);
3191
3192         /* first remove proc,
3193          * drbdsetup uses it's presence to detect
3194          * whether DRBD is loaded.
3195          * If we would get stuck in proc removal,
3196          * but have netlink already deregistered,
3197          * some drbdsetup commands may wait forever
3198          * for an answer.
3199          */
3200         if (drbd_proc)
3201                 remove_proc_entry("drbd", NULL);
3202
3203         drbd_nl_cleanup();
3204
3205         if (minor_table) {
3206                 i = minor_count;
3207                 while (i--)
3208                         drbd_delete_device(i);
3209                 drbd_destroy_mempools();
3210         }
3211
3212         kfree(minor_table);
3213
3214         unregister_blkdev(DRBD_MAJOR, "drbd");
3215
3216         printk(KERN_INFO "drbd: module cleanup done.\n");
3217 }
3218
3219 /**
3220  * drbd_congested() - Callback for pdflush
3221  * @congested_data:     User data
3222  * @bdi_bits:           Bits pdflush is currently interested in
3223  *
3224  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3225  */
3226 static int drbd_congested(void *congested_data, int bdi_bits)
3227 {
3228         struct drbd_conf *mdev = congested_data;
3229         struct request_queue *q;
3230         char reason = '-';
3231         int r = 0;
3232
3233         if (!__inc_ap_bio_cond(mdev)) {
3234                 /* DRBD has frozen IO */
3235                 r = bdi_bits;
3236                 reason = 'd';
3237                 goto out;
3238         }
3239
3240         if (get_ldev(mdev)) {
3241                 q = bdev_get_queue(mdev->ldev->backing_bdev);
3242                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3243                 put_ldev(mdev);
3244                 if (r)
3245                         reason = 'b';
3246         }
3247
3248         if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3249                 r |= (1 << BDI_async_congested);
3250                 reason = reason == 'b' ? 'a' : 'n';
3251         }
3252
3253 out:
3254         mdev->congestion_reason = reason;
3255         return r;
3256 }
3257
3258 struct drbd_conf *drbd_new_device(unsigned int minor)
3259 {
3260         struct drbd_conf *mdev;
3261         struct gendisk *disk;
3262         struct request_queue *q;
3263
3264         /* GFP_KERNEL, we are outside of all write-out paths */
3265         mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3266         if (!mdev)
3267                 return NULL;
3268         if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3269                 goto out_no_cpumask;
3270
3271         mdev->minor = minor;
3272
3273         drbd_init_set_defaults(mdev);
3274
3275         q = blk_alloc_queue(GFP_KERNEL);
3276         if (!q)
3277                 goto out_no_q;
3278         mdev->rq_queue = q;
3279         q->queuedata   = mdev;
3280
3281         disk = alloc_disk(1);
3282         if (!disk)
3283                 goto out_no_disk;
3284         mdev->vdisk = disk;
3285
3286         set_disk_ro(disk, true);
3287
3288         disk->queue = q;
3289         disk->major = DRBD_MAJOR;
3290         disk->first_minor = minor;
3291         disk->fops = &drbd_ops;
3292         sprintf(disk->disk_name, "drbd%d", minor);
3293         disk->private_data = mdev;
3294
3295         mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3296         /* we have no partitions. we contain only ourselves. */
3297         mdev->this_bdev->bd_contains = mdev->this_bdev;
3298
3299         q->backing_dev_info.congested_fn = drbd_congested;
3300         q->backing_dev_info.congested_data = mdev;
3301
3302         blk_queue_make_request(q, drbd_make_request);
3303         blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
3304         blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3305         blk_queue_merge_bvec(q, drbd_merge_bvec);
3306         q->queue_lock = &mdev->req_lock;
3307
3308         mdev->md_io_page = alloc_page(GFP_KERNEL);
3309         if (!mdev->md_io_page)
3310                 goto out_no_io_page;
3311
3312         if (drbd_bm_init(mdev))
3313                 goto out_no_bitmap;
3314         /* no need to lock access, we are still initializing this minor device. */
3315         if (!tl_init(mdev))
3316                 goto out_no_tl;
3317
3318         mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3319         if (!mdev->app_reads_hash)
3320                 goto out_no_app_reads;
3321
3322         mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3323         if (!mdev->current_epoch)
3324                 goto out_no_epoch;
3325
3326         INIT_LIST_HEAD(&mdev->current_epoch->list);
3327         mdev->epochs = 1;
3328
3329         return mdev;
3330
3331 /* out_whatever_else:
3332         kfree(mdev->current_epoch); */
3333 out_no_epoch:
3334         kfree(mdev->app_reads_hash);
3335 out_no_app_reads:
3336         tl_cleanup(mdev);
3337 out_no_tl:
3338         drbd_bm_cleanup(mdev);
3339 out_no_bitmap:
3340         __free_page(mdev->md_io_page);
3341 out_no_io_page:
3342         put_disk(disk);
3343 out_no_disk:
3344         blk_cleanup_queue(q);
3345 out_no_q:
3346         free_cpumask_var(mdev->cpu_mask);
3347 out_no_cpumask:
3348         kfree(mdev);
3349         return NULL;
3350 }
3351
/* counterpart of drbd_new_device.
 * last part of drbd_delete_device.
 *
 * Releases the members that drbd_new_device() allocated (epoch, app reads
 * hash, transfer log, bitmap, meta data IO page, gendisk, request queue,
 * cpu mask), the tl hash, and finally the mdev itself.
 * Note that mdev->this_bdev is not released here; drbd_delete_device()
 * does the bdput() before calling this function. */
void drbd_free_mdev(struct drbd_conf *mdev)
{
        kfree(mdev->current_epoch);
        kfree(mdev->app_reads_hash);
        tl_cleanup(mdev);
        if (mdev->bitmap) /* should no longer be there. */
                drbd_bm_cleanup(mdev);
        __free_page(mdev->md_io_page);
        put_disk(mdev->vdisk);
        blk_cleanup_queue(mdev->rq_queue);
        free_cpumask_var(mdev->cpu_mask);
        drbd_free_tl_hash(mdev);
        /* mdev must not be touched after this point */
        kfree(mdev);
}
3368
3369
3370 int __init drbd_init(void)
3371 {
3372         int err;
3373
3374         if (sizeof(struct p_handshake) != 80) {
3375                 printk(KERN_ERR
3376                        "drbd: never change the size or layout "
3377                        "of the HandShake packet.\n");
3378                 return -EINVAL;
3379         }
3380
3381         if (1 > minor_count || minor_count > 255) {
3382                 printk(KERN_ERR
3383                         "drbd: invalid minor_count (%d)\n", minor_count);
3384 #ifdef MODULE
3385                 return -EINVAL;
3386 #else
3387                 minor_count = 8;
3388 #endif
3389         }
3390
3391         err = drbd_nl_init();
3392         if (err)
3393                 return err;
3394
3395         err = register_blkdev(DRBD_MAJOR, "drbd");
3396         if (err) {
3397                 printk(KERN_ERR
3398                        "drbd: unable to register block device major %d\n",
3399                        DRBD_MAJOR);
3400                 return err;
3401         }
3402
3403         register_reboot_notifier(&drbd_notifier);
3404
3405         /*
3406          * allocate all necessary structs
3407          */
3408         err = -ENOMEM;
3409
3410         init_waitqueue_head(&drbd_pp_wait);
3411
3412         drbd_proc = NULL; /* play safe for drbd_cleanup */
3413         minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3414                                 GFP_KERNEL);
3415         if (!minor_table)
3416                 goto Enomem;
3417
3418         err = drbd_create_mempools();
3419         if (err)
3420                 goto Enomem;
3421
3422         drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3423         if (!drbd_proc) {
3424                 printk(KERN_ERR "drbd: unable to register proc file\n");
3425                 goto Enomem;
3426         }
3427
3428         rwlock_init(&global_state_lock);
3429
3430         printk(KERN_INFO "drbd: initialized. "
3431                "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3432                API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3433         printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3434         printk(KERN_INFO "drbd: registered as block device major %d\n",
3435                 DRBD_MAJOR);
3436         printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3437
3438         return 0; /* Success! */
3439
3440 Enomem:
3441         drbd_cleanup();
3442         if (err == -ENOMEM)
3443                 /* currently always the case */
3444                 printk(KERN_ERR "drbd: ran out of memory\n");
3445         else
3446                 printk(KERN_ERR "drbd: initialization failure\n");
3447         return err;
3448 }
3449
3450 void drbd_free_bc(struct drbd_backing_dev *ldev)
3451 {
3452         if (ldev == NULL)
3453                 return;
3454
3455         blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3456         blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3457
3458         kfree(ldev);
3459 }
3460
3461 void drbd_free_sock(struct drbd_conf *mdev)
3462 {
3463         if (mdev->data.socket) {
3464                 mutex_lock(&mdev->data.mutex);
3465                 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3466                 sock_release(mdev->data.socket);
3467                 mdev->data.socket = NULL;
3468                 mutex_unlock(&mdev->data.mutex);
3469         }
3470         if (mdev->meta.socket) {
3471                 mutex_lock(&mdev->meta.mutex);
3472                 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3473                 sock_release(mdev->meta.socket);
3474                 mdev->meta.socket = NULL;
3475                 mutex_unlock(&mdev->meta.mutex);
3476         }
3477 }
3478
3479
/* Release the resources a device may have acquired while it was configured:
 * the five crypto hash transforms, both sockets, and the backing device.
 * Every pointer is reset to NULL after its object has been released. */
void drbd_free_resources(struct drbd_conf *mdev)
{
        crypto_free_hash(mdev->csums_tfm);
        mdev->csums_tfm = NULL;
        crypto_free_hash(mdev->verify_tfm);
        mdev->verify_tfm = NULL;
        crypto_free_hash(mdev->cram_hmac_tfm);
        mdev->cram_hmac_tfm = NULL;
        crypto_free_hash(mdev->integrity_w_tfm);
        mdev->integrity_w_tfm = NULL;
        crypto_free_hash(mdev->integrity_r_tfm);
        mdev->integrity_r_tfm = NULL;

        drbd_free_sock(mdev);

        /* NOTE(review): __no_warn(local, ...) presumably silences a static
         * checker warning about touching mdev->ldev here - confirm against
         * the macro definition. */
        __no_warn(local,
                  drbd_free_bc(mdev->ldev);
                  mdev->ldev = NULL;);
}
3499
3500 /* meta data management */
3501
/* Layout of the meta data super block as stored on disk.
 * drbd_md_sync() fills the fields it writes through cpu_to_be32()/
 * cpu_to_be64() and drbd_md_read() decodes them with be32_to_cpu()/
 * be64_to_cpu(), so those values are big-endian on disk.
 * This is an on-disk format: do not reorder or resize members. */
struct meta_data_on_disk {
        u64 la_size;           /* last agreed size. */
        u64 uuid[UI_SIZE];   /* UUIDs. */
        u64 device_uuid;
        u64 reserved_u64_1;
        u32 flags;             /* MDF */
        u32 magic;             /* compared against DRBD_MD_MAGIC on read */
        u32 md_size_sect;
        u32 al_offset;         /* offset to this block */
        u32 al_nr_extents;     /* important for restoring the AL */
              /* `-- act_log->nr_elements <-- sync_conf.al_extents */
        u32 bm_offset;         /* offset to the bitmap, from here */
        u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
        u32 reserved_u32[4];

} __packed;
3518
3519 /**
3520  * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3521  * @mdev:       DRBD device.
3522  */
3523 void drbd_md_sync(struct drbd_conf *mdev)
3524 {
3525         struct meta_data_on_disk *buffer;
3526         sector_t sector;
3527         int i;
3528
3529         del_timer(&mdev->md_sync_timer);
3530         /* timer may be rearmed by drbd_md_mark_dirty() now. */
3531         if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3532                 return;
3533
3534         /* We use here D_FAILED and not D_ATTACHING because we try to write
3535          * metadata even if we detach due to a disk failure! */
3536         if (!get_ldev_if_state(mdev, D_FAILED))
3537                 return;
3538
3539         mutex_lock(&mdev->md_io_mutex);
3540         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3541         memset(buffer, 0, 512);
3542
3543         buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3544         for (i = UI_CURRENT; i < UI_SIZE; i++)
3545                 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3546         buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3547         buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3548
3549         buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3550         buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3551         buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3552         buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3553         buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3554
3555         buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3556
3557         D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3558         sector = mdev->ldev->md.md_offset;
3559
3560         if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3561                 /* this was a try anyways ... */
3562                 dev_err(DEV, "meta data update failed!\n");
3563                 drbd_chk_io_error(mdev, 1, true);
3564         }
3565
3566         /* Update mdev->ldev->md.la_size_sect,
3567          * since we updated it on metadata. */
3568         mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3569
3570         mutex_unlock(&mdev->md_io_mutex);
3571         put_ldev(mdev);
3572 }
3573
3574 /**
3575  * drbd_md_read() - Reads in the meta data super block
3576  * @mdev:       DRBD device.
3577  * @bdev:       Device from which the meta data should be read in.
3578  *
3579  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3580  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3581  */
3582 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3583 {
3584         struct meta_data_on_disk *buffer;
3585         int i, rv = NO_ERROR;
3586
3587         if (!get_ldev_if_state(mdev, D_ATTACHING))
3588                 return ERR_IO_MD_DISK;
3589
3590         mutex_lock(&mdev->md_io_mutex);
3591         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3592
3593         if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3594                 /* NOTE: cant do normal error processing here as this is
3595                    called BEFORE disk is attached */
3596                 dev_err(DEV, "Error while reading metadata.\n");
3597                 rv = ERR_IO_MD_DISK;
3598                 goto err;
3599         }
3600
3601         if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3602                 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3603                 rv = ERR_MD_INVALID;
3604                 goto err;
3605         }
3606         if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3607                 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3608                     be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3609                 rv = ERR_MD_INVALID;
3610                 goto err;
3611         }
3612         if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3613                 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3614                     be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3615                 rv = ERR_MD_INVALID;
3616                 goto err;
3617         }
3618         if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3619                 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3620                     be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3621                 rv = ERR_MD_INVALID;
3622                 goto err;
3623         }
3624
3625         if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3626                 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3627                     be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3628                 rv = ERR_MD_INVALID;
3629                 goto err;
3630         }
3631
3632         bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3633         for (i = UI_CURRENT; i < UI_SIZE; i++)
3634                 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3635         bdev->md.flags = be32_to_cpu(buffer->flags);
3636         mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3637         bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3638
3639         if (mdev->sync_conf.al_extents < 7)
3640                 mdev->sync_conf.al_extents = 127;
3641
3642  err:
3643         mutex_unlock(&mdev->md_io_mutex);
3644         put_ldev(mdev);
3645
3646         return rv;
3647 }
3648
3649 static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3650 {
3651         static char *uuid_str[UI_EXTENDED_SIZE] = {
3652                 [UI_CURRENT] = "CURRENT",
3653                 [UI_BITMAP] = "BITMAP",
3654                 [UI_HISTORY_START] = "HISTORY_START",
3655                 [UI_HISTORY_END] = "HISTORY_END",
3656                 [UI_SIZE] = "SIZE",
3657                 [UI_FLAGS] = "FLAGS",
3658         };
3659
3660         if (index >= UI_EXTENDED_SIZE) {
3661                 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3662                 return;
3663         }
3664
3665         dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3666                  uuid_str[index],
3667                  (unsigned long long)mdev->ldev->md.uuid[index]);
3668 }
3669
3670
3671 /**
3672  * drbd_md_mark_dirty() - Mark meta data super block as dirty
3673  * @mdev:       DRBD device.
3674  *
3675  * Call this function if you change anything that should be written to
3676  * the meta-data super block. This function sets MD_DIRTY, and starts a
3677  * timer that ensures that within five seconds you have to call drbd_md_sync().
3678  */
3679 #ifdef DEBUG
3680 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3681 {
3682         if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3683                 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3684                 mdev->last_md_mark_dirty.line = line;
3685                 mdev->last_md_mark_dirty.func = func;
3686         }
3687 }
3688 #else
3689 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3690 {
3691         if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3692                 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3693 }
3694 #endif
3695
3696 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3697 {
3698         int i;
3699
3700         for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
3701                 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3702                 debug_drbd_uuid(mdev, i+1);
3703         }
3704 }
3705
3706 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3707 {
3708         if (idx == UI_CURRENT) {
3709                 if (mdev->state.role == R_PRIMARY)
3710                         val |= 1;
3711                 else
3712                         val &= ~((u64)1);
3713
3714                 drbd_set_ed_uuid(mdev, val);
3715         }
3716
3717         mdev->ldev->md.uuid[idx] = val;
3718         debug_drbd_uuid(mdev, idx);
3719         drbd_md_mark_dirty(mdev);
3720 }
3721
3722
3723 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3724 {
3725         if (mdev->ldev->md.uuid[idx]) {
3726                 drbd_uuid_move_history(mdev);
3727                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3728                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3729         }
3730         _drbd_uuid_set(mdev, idx, val);
3731 }
3732
3733 /**
3734  * drbd_uuid_new_current() - Creates a new current UUID
3735  * @mdev:       DRBD device.
3736  *
3737  * Creates a new current UUID, and rotates the old current UUID into
3738  * the bitmap slot. Causes an incremental resync upon next connect.
3739  */
3740 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3741 {
3742         u64 val;
3743
3744         dev_info(DEV, "Creating new current UUID\n");
3745         D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3746         mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3747         debug_drbd_uuid(mdev, UI_BITMAP);
3748
3749         get_random_bytes(&val, sizeof(u64));
3750         _drbd_uuid_set(mdev, UI_CURRENT, val);
3751         /* get it to stable storage _now_ */
3752         drbd_md_sync(mdev);
3753 }
3754
3755 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3756 {
3757         if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3758                 return;
3759
3760         if (val == 0) {
3761                 drbd_uuid_move_history(mdev);
3762                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3763                 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3764                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3765                 debug_drbd_uuid(mdev, UI_BITMAP);
3766         } else {
3767                 if (mdev->ldev->md.uuid[UI_BITMAP])
3768                         dev_warn(DEV, "bm UUID already set");
3769
3770                 mdev->ldev->md.uuid[UI_BITMAP] = val;
3771                 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3772
3773                 debug_drbd_uuid(mdev, UI_BITMAP);
3774         }
3775         drbd_md_mark_dirty(mdev);
3776 }
3777
3778 /**
3779  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3780  * @mdev:       DRBD device.
3781  *
3782  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3783  */
3784 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3785 {
3786         int rv = -EIO;
3787
3788         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3789                 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3790                 drbd_md_sync(mdev);
3791                 drbd_bm_set_all(mdev);
3792
3793                 rv = drbd_bm_write(mdev);
3794
3795                 if (!rv) {
3796                         drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3797                         drbd_md_sync(mdev);
3798                 }
3799
3800                 put_ldev(mdev);
3801         }
3802
3803         return rv;
3804 }
3805
3806 /**
3807  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3808  * @mdev:       DRBD device.
3809  *
3810  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3811  */
3812 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3813 {
3814         int rv = -EIO;
3815
3816         drbd_resume_al(mdev);
3817         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3818                 drbd_bm_clear_all(mdev);
3819                 rv = drbd_bm_write(mdev);
3820                 put_ldev(mdev);
3821         }
3822
3823         return rv;
3824 }
3825
/* Worker callback performing a bitmap IO described by the struct bm_io_work
 * that embeds @w (presumably the work queued by drbd_queue_bitmap_io() -
 * confirm where the callback pointer is installed).
 *
 * Runs work->io_fn under the bitmap lock, clears BITMAP_IO and wakes up
 * waiters on misc_wait, then calls the optional work->done with the
 * result.  Always returns 1. */
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
        struct bm_io_work *work = container_of(w, struct bm_io_work, w);
        int rv;

        /* application IO is expected to be drained at this point */
        D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);

        drbd_bm_lock(mdev, work->why);
        rv = work->io_fn(mdev);
        drbd_bm_unlock(mdev);

        /* the flag must be visibly cleared before waiters are woken */
        clear_bit(BITMAP_IO, &mdev->flags);
        smp_mb__after_clear_bit();
        wake_up(&mdev->misc_wait);

        if (work->done)
                work->done(mdev, rv);

        /* only now may a new bitmap IO be queued; drbd_queue_bitmap_io()
         * checks both BITMAP_IO_QUEUED and work->why */
        clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
        work->why = NULL;

        return 1;
}
3849
/* Destroy everything that belongs to the local disk of @mdev: the resync
 * and activity log caches, the backing device descriptor, and the
 * temporary meta data IO page if one was allocated.  All pointers are
 * reset to NULL, and the GO_DISKLESS flag is cleared at the end. */
void drbd_ldev_destroy(struct drbd_conf *mdev)
{
        lc_destroy(mdev->resync);
        mdev->resync = NULL;
        lc_destroy(mdev->act_log);
        mdev->act_log = NULL;
        __no_warn(local,
                drbd_free_bc(mdev->ldev);
                mdev->ldev = NULL;);

        if (mdev->md_io_tmpp) {
                __free_page(mdev->md_io_tmpp);
                mdev->md_io_tmpp = NULL;
        }
        /* drbd_go_diskless() sets this flag when it queues the work */
        clear_bit(GO_DISKLESS, &mdev->flags);
}
3866
/* Worker callback (presumably the one behind mdev->go_diskless, which
 * drbd_go_diskless() queues - confirm where the callback is installed):
 * forces the disk state of @mdev to D_DISKLESS.  Always returns 1. */
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
        D_ASSERT(mdev->state.disk == D_FAILED);
        /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
         * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
         * the protected members anymore, though, so once put_ldev reaches zero
         * again, it will be safe to free them. */
        drbd_force_state(mdev, NS(disk, D_DISKLESS));
        return 1;
}
3877
3878 void drbd_go_diskless(struct drbd_conf *mdev)
3879 {
3880         D_ASSERT(mdev->state.disk == D_FAILED);
3881         if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3882                 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3883 }
3884
3885 /**
3886  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3887  * @mdev:       DRBD device.
3888  * @io_fn:      IO callback to be called when bitmap IO is possible
3889  * @done:       callback to be called after the bitmap IO was performed
3890  * @why:        Descriptive text of the reason for doing the IO
3891  *
3892  * While IO on the bitmap happens we freeze application IO thus we ensure
3893  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3894  * called from worker context. It MUST NOT be used while a previous such
3895  * work is still pending!
3896  */
3897 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3898                           int (*io_fn)(struct drbd_conf *),
3899                           void (*done)(struct drbd_conf *, int),
3900                           char *why)
3901 {
3902         D_ASSERT(current == mdev->worker.task);
3903
3904         D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3905         D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3906         D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3907         if (mdev->bm_io_work.why)
3908                 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3909                         why, mdev->bm_io_work.why);
3910
3911         mdev->bm_io_work.io_fn = io_fn;
3912         mdev->bm_io_work.done = done;
3913         mdev->bm_io_work.why = why;
3914
3915         spin_lock_irq(&mdev->req_lock);
3916         set_bit(BITMAP_IO, &mdev->flags);
3917         if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3918                 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
3919                         drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3920         }
3921         spin_unlock_irq(&mdev->req_lock);
3922 }
3923
3924 /**
3925  * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
3926  * @mdev:       DRBD device.
3927  * @io_fn:      IO callback to be called when bitmap IO is possible
3928  * @why:        Descriptive text of the reason for doing the IO
3929  *
3930  * freezes application IO while that the actual IO operations runs. This
3931  * functions MAY NOT be called from worker context.
3932  */
3933 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3934 {
3935         int rv;
3936
3937         D_ASSERT(current != mdev->worker.task);
3938
3939         drbd_suspend_io(mdev);
3940
3941         drbd_bm_lock(mdev, why);
3942         rv = io_fn(mdev);
3943         drbd_bm_unlock(mdev);
3944
3945         drbd_resume_io(mdev);
3946
3947         return rv;
3948 }
3949
3950 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3951 {
3952         if ((mdev->ldev->md.flags & flag) != flag) {
3953                 drbd_md_mark_dirty(mdev);
3954                 mdev->ldev->md.flags |= flag;
3955         }
3956 }
3957
3958 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3959 {
3960         if ((mdev->ldev->md.flags & flag) != 0) {
3961                 drbd_md_mark_dirty(mdev);
3962                 mdev->ldev->md.flags &= ~flag;
3963         }
3964 }
3965 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3966 {
3967         return (bdev->md.flags & flag) != 0;
3968 }
3969
/* Timer callback (presumably for md_sync_timer, which drbd_md_mark_dirty()
 * arms - confirm where the timer is set up).
 * @data carries the struct drbd_conf pointer.  The md_sync_work is put at
 * the front of the data work queue. */
static void md_sync_timer_fn(unsigned long data)
{
        struct drbd_conf *mdev = (struct drbd_conf *) data;

        drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
}
3976
/* Worker callback (presumably the one behind mdev->md_sync_work, which
 * md_sync_timer_fn() queues - confirm where the callback is installed).
 * Logs a warning that the timer expired, in DEBUG builds also the call site
 * recorded by drbd_md_mark_dirty_(), and then writes the meta data via
 * drbd_md_sync().  Always returns 1. */
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
        dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
        dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
                mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
#endif
        drbd_md_sync(mdev);
        return 1;
}
3987
3988 #ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
        unsigned long state;    /* current value of the generator */
        unsigned long count;    /* calls left until state is refreshed
                                 * from get_random_bytes() */
};

/* multiplier and increment of the linear congruential step in
 * _drbd_fault_random() */
#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD        479001701 /* prime */
/* value the refresh countdown is reset to after each refresh */
#define FAULT_RANDOM_REFRESH 10000
3999
4000 /*
4001  * Crude but fast random-number generator.  Uses a linear congruential
4002  * generator, with occasional help from get_random_bytes().
4003  */
4004 static unsigned long
4005 _drbd_fault_random(struct fault_random_state *rsp)
4006 {
4007         long refresh;
4008
4009         if (!rsp->count--) {
4010                 get_random_bytes(&refresh, sizeof(refresh));
4011                 rsp->state += refresh;
4012                 rsp->count = FAULT_RANDOM_REFRESH;
4013         }
4014         rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4015         return swahw32(rsp->state);
4016 }
4017
4018 static char *
4019 _drbd_fault_str(unsigned int type) {
4020         static char *_faults[] = {
4021                 [DRBD_FAULT_MD_WR] = "Meta-data write",
4022                 [DRBD_FAULT_MD_RD] = "Meta-data read",
4023                 [DRBD_FAULT_RS_WR] = "Resync write",
4024                 [DRBD_FAULT_RS_RD] = "Resync read",
4025                 [DRBD_FAULT_DT_WR] = "Data write",
4026                 [DRBD_FAULT_DT_RD] = "Data read",
4027                 [DRBD_FAULT_DT_RA] = "Data read ahead",
4028                 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4029                 [DRBD_FAULT_AL_EE] = "EE allocation",
4030                 [DRBD_FAULT_RECEIVE] = "receive data corruption",
4031         };
4032
4033         return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4034 }
4035
4036 unsigned int
4037 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4038 {
4039         static struct fault_random_state rrs = {0, 0};
4040
4041         unsigned int ret = (
4042                 (fault_devs == 0 ||
4043                         ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4044                 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4045
4046         if (ret) {
4047                 fault_count++;
4048
4049                 if (__ratelimit(&drbd_ratelimit_state))
4050                         dev_warn(DEV, "***Simulating %s failure\n",
4051                                 _drbd_fault_str(type));
4052         }
4053
4054         return ret;
4055 }
4056 #endif
4057
/* Returns a string identifying this build of DRBD: "srcversion: ..." taken
 * from the module information when built as a module, "built-in" otherwise.
 * The string lives in a static buffer that is filled in on the first call. */
const char *drbd_buildtag(void)
{
        /* DRBD built from external sources has here a reference to the
           git hash of the source code. */

        /* The leading NUL marks the buffer as "not yet initialized"; it is
         * either overwritten by the srcversion string or patched to 'b',
         * which completes the literal to "built-in". */
        static char buildtag[38] = "\0uilt-in";

        if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
                if (THIS_MODULE != NULL)
                        /* bounded: never write past the static buffer */
                        snprintf(buildtag, sizeof(buildtag),
                                 "srcversion: %-24s", THIS_MODULE->srcversion);
                else
#endif
                        buildtag[0] = 'b';
        }

        return buildtag;
}
4076
/* module entry and exit points */
module_init(drbd_init)
module_exit(drbd_cleanup)

/* state name lookup helpers made available to other modules */
EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);