drbd: Rename drbd_make_request_26 to drbd_make_request
[pandora-kernel.git] / drivers / block / drbd / drbd_main.c
1 /*
2    drbd.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27  */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
59 struct after_state_chg_work {        /* one state change packaged as a work item; presumably consumed by w_after_state_ch() - confirm */
60         struct drbd_work w;          /* embedded generic work item, first member */
61         union drbd_state os;         /* old state (same naming as the os/ns parameters used throughout this file) */
62         union drbd_state ns;         /* new state */
63         enum chg_state_flags flags;  /* CS_* flags the state change was requested with */
64         struct completion *done;     /* optional completion; NOTE(review): looks like it is signalled for CS_WAIT_COMPLETE callers - confirm */
65 };
66
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
71
72 int drbd_init(void);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77                            union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82
83 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84               "Lars Ellenberg <lars@linbit.com>");
85 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86 MODULE_VERSION(REL_VERSION);
87 MODULE_LICENSE("GPL");
88 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
89 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
90
91 #include <linux/moduleparam.h>
92 /* allow_open_on_secondary */
93 MODULE_PARM_DESC(allow_oos, "DONT USE!");
94 /* thanks to these macros, if compiled into the kernel (not-module),
95  * this becomes the boot parameter drbd.minor_count */
96 module_param(minor_count, uint, 0444);
97 module_param(disable_sendpage, bool, 0644);
98 module_param(allow_oos, bool, 0);
99 module_param(cn_idx, uint, 0444);
100 module_param(proc_details, int, 0644);
101
102 #ifdef CONFIG_DRBD_FAULT_INJECTION
103 int enable_faults;
104 int fault_rate;
105 static int fault_count;
106 int fault_devs;
107 /* bitmap of enabled faults */
108 module_param(enable_faults, int, 0664);
109 /* fault rate % value - applies to all enabled faults */
110 module_param(fault_rate, int, 0664);
111 /* count of faults inserted */
112 module_param(fault_count, int, 0664);
113 /* bitmap of devices to insert faults on */
114 module_param(fault_devs, int, 0644);
115 #endif
116
117 /* module parameter, defined */
118 unsigned int minor_count = 32;
119 int disable_sendpage;
120 int allow_oos;
121 unsigned int cn_idx = CN_IDX_DRBD;
122 int proc_details;       /* Detail level in proc drbd*/
123
124 /* Module parameter for setting the user mode helper program
125  * to run. Default is /sbin/drbdadm */
126 char usermode_helper[80] = "/sbin/drbdadm";
127
128 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
129
130 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
131  * as member "struct gendisk *vdisk;"
132  */
133 struct drbd_conf **minor_table;
134
135 struct kmem_cache *drbd_request_cache;
136 struct kmem_cache *drbd_ee_cache;       /* epoch entries */
137 struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
138 struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
139 mempool_t *drbd_request_mempool;
140 mempool_t *drbd_ee_mempool;
141
142 /* I do not use a standard mempool, because:
143    1) I want to hand out the pre-allocated objects first.
144    2) I want to be able to interrupt sleeping allocation with a signal.
145    Note: This is a single linked list, the next pointer is the private
146          member of struct page.
147  */
148 struct page *drbd_pp_pool;
149 spinlock_t   drbd_pp_lock;
150 int          drbd_pp_vacant;
151 wait_queue_head_t drbd_pp_wait;
152
153 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
154
155 static const struct block_device_operations drbd_ops = {
156         .owner =   THIS_MODULE,
157         .open =    drbd_open,
158         .release = drbd_release,
159 };
160
161 #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
162
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.

   Takes a reference on mdev->local_cnt and keeps it only if the disk state
   is at least @mins.  Returns 1 with the reference held, 0 otherwise; when
   the reference is given back and the count drops to zero, waiters on
   mdev->misc_wait are woken up.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	/* grab the reference first, look at the state afterwards */
	atomic_inc(&mdev->local_cnt);
	if (mdev->state.disk >= mins)
		return 1;

	/* disk state too low: hand the reference back */
	if (atomic_dec_and_test(&mdev->local_cnt))
		wake_up(&mdev->misc_wait);
	return 0;
}

#endif
181
182 /**
183  * DOC: The transfer log
184  *
185  * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
186  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
187  * of the list. There is always at least one &struct drbd_tl_epoch object.
188  *
189  * Each &struct drbd_tl_epoch has a circular double linked list of requests
190  * attached.
191  */
192 static int tl_init(struct drbd_conf *mdev)
193 {
194         struct drbd_tl_epoch *b;
195
196         /* during device minor initialization, we may well use GFP_KERNEL */
197         b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
198         if (!b)
199                 return 0;
200         INIT_LIST_HEAD(&b->requests);
201         INIT_LIST_HEAD(&b->w.list);
202         b->next = NULL;
203         b->br_number = 4711;
204         b->n_writes = 0;
205         b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
206
207         mdev->oldest_tle = b;
208         mdev->newest_tle = b;
209         INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
210
211         mdev->tl_hash = NULL;
212         mdev->tl_hash_s = 0;
213
214         return 1;
215 }
216
217 static void tl_cleanup(struct drbd_conf *mdev)
218 {
219         D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
220         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
221         kfree(mdev->oldest_tle);
222         mdev->oldest_tle = NULL;
223         kfree(mdev->unused_spare_tle);
224         mdev->unused_spare_tle = NULL;
225         kfree(mdev->tl_hash);
226         mdev->tl_hash = NULL;
227         mdev->tl_hash_s = 0;
228 }
229
230 /**
231  * _tl_add_barrier() - Adds a barrier to the transfer log
232  * @mdev:       DRBD device.
233  * @new:        Barrier to be added before the current head of the TL.
234  *
235  * The caller must hold the req_lock.
236  */
237 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
238 {
239         struct drbd_tl_epoch *newest_before;
240
241         INIT_LIST_HEAD(&new->requests);
242         INIT_LIST_HEAD(&new->w.list);
243         new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
244         new->next = NULL;
245         new->n_writes = 0;
246
247         newest_before = mdev->newest_tle;
248         /* never send a barrier number == 0, because that is special-cased
249          * when using TCQ for our write ordering code */
250         new->br_number = (newest_before->br_number+1) ?: 1;
251         if (mdev->newest_tle != new) {
252                 mdev->newest_tle->next = new;
253                 mdev->newest_tle = new;
254         }
255 }
256
257 /**
258  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
259  * @mdev:       DRBD device.
260  * @barrier_nr: Expected identifier of the DRBD write barrier packet.
261  * @set_size:   Expected number of requests before that barrier.
262  *
263  * In case the passed barrier_nr or set_size does not match the oldest
264  * &struct drbd_tl_epoch objects this function will cause a termination
265  * of the connection.
266  */
267 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
268                        unsigned int set_size)
269 {
270         struct drbd_tl_epoch *b, *nob; /* next old barrier */
271         struct list_head *le, *tle;
272         struct drbd_request *r;
273
274         spin_lock_irq(&mdev->req_lock);
275
276         b = mdev->oldest_tle;
277
278         /* first some paranoia code */
279         if (b == NULL) {
280                 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
281                         barrier_nr);
282                 goto bail;
283         }
284         if (b->br_number != barrier_nr) {
285                 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
286                         barrier_nr, b->br_number);
287                 goto bail;
288         }
289         if (b->n_writes != set_size) {
290                 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
291                         barrier_nr, set_size, b->n_writes);
292                 goto bail;
293         }
294
295         /* Clean up list of requests processed during current epoch */
296         list_for_each_safe(le, tle, &b->requests) {
297                 r = list_entry(le, struct drbd_request, tl_requests);
298                 _req_mod(r, barrier_acked);
299         }
300         /* There could be requests on the list waiting for completion
301            of the write to the local disk. To avoid corruptions of
302            slab's data structures we have to remove the lists head.
303
304            Also there could have been a barrier ack out of sequence, overtaking
305            the write acks - which would be a bug and violating write ordering.
306            To not deadlock in case we lose connection while such requests are
307            still pending, we need some way to find them for the
308            _req_mode(connection_lost_while_pending).
309
310            These have been list_move'd to the out_of_sequence_requests list in
311            _req_mod(, barrier_acked) above.
312            */
313         list_del_init(&b->requests);
314
315         nob = b->next;
316         if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
317                 _tl_add_barrier(mdev, b);
318                 if (nob)
319                         mdev->oldest_tle = nob;
320                 /* if nob == NULL b was the only barrier, and becomes the new
321                    barrier. Therefore mdev->oldest_tle points already to b */
322         } else {
323                 D_ASSERT(nob != NULL);
324                 mdev->oldest_tle = nob;
325                 kfree(b);
326         }
327
328         spin_unlock_irq(&mdev->req_lock);
329         dec_ap_pending(mdev);
330
331         return;
332
333 bail:
334         spin_unlock_irq(&mdev->req_lock);
335         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
336 }
337
338 /**
339  * _tl_restart() - Walks the transfer log, and applies an action to all requests
340  * @mdev:       DRBD device.
341  * @what:       The action/event to perform with all request objects
342  *
343  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
344  * restart_frozen_disk_io.
345  */
346 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
347 {
348         struct drbd_tl_epoch *b, *tmp, **pn;
349         struct list_head *le, *tle, carry_reads;
350         struct drbd_request *req;
351         int rv, n_writes, n_reads;
352
353         b = mdev->oldest_tle;
354         pn = &mdev->oldest_tle;
355         while (b) {
356                 n_writes = 0;
357                 n_reads = 0;
358                 INIT_LIST_HEAD(&carry_reads);
359                 list_for_each_safe(le, tle, &b->requests) {
360                         req = list_entry(le, struct drbd_request, tl_requests);
361                         rv = _req_mod(req, what);
362
363                         n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
364                         n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
365                 }
366                 tmp = b->next;
367
368                 if (n_writes) {
369                         if (what == resend) {
370                                 b->n_writes = n_writes;
371                                 if (b->w.cb == NULL) {
372                                         b->w.cb = w_send_barrier;
373                                         inc_ap_pending(mdev);
374                                         set_bit(CREATE_BARRIER, &mdev->flags);
375                                 }
376
377                                 drbd_queue_work(&mdev->data.work, &b->w);
378                         }
379                         pn = &b->next;
380                 } else {
381                         if (n_reads)
382                                 list_add(&carry_reads, &b->requests);
383                         /* there could still be requests on that ring list,
384                          * in case local io is still pending */
385                         list_del(&b->requests);
386
387                         /* dec_ap_pending corresponding to queue_barrier.
388                          * the newest barrier may not have been queued yet,
389                          * in which case w.cb is still NULL. */
390                         if (b->w.cb != NULL)
391                                 dec_ap_pending(mdev);
392
393                         if (b == mdev->newest_tle) {
394                                 /* recycle, but reinit! */
395                                 D_ASSERT(tmp == NULL);
396                                 INIT_LIST_HEAD(&b->requests);
397                                 list_splice(&carry_reads, &b->requests);
398                                 INIT_LIST_HEAD(&b->w.list);
399                                 b->w.cb = NULL;
400                                 b->br_number = net_random();
401                                 b->n_writes = 0;
402
403                                 *pn = b;
404                                 break;
405                         }
406                         *pn = tmp;
407                         kfree(b);
408                 }
409                 b = tmp;
410                 list_splice(&carry_reads, &b->requests);
411         }
412 }
413
414
415 /**
416  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
417  * @mdev:       DRBD device.
418  *
419  * This is called after the connection to the peer was lost. The storage covered
420  * by the requests on the transfer gets marked as our of sync. Called from the
421  * receiver thread and the worker thread.
422  */
423 void tl_clear(struct drbd_conf *mdev)
424 {
425         struct list_head *le, *tle;
426         struct drbd_request *r;
427
428         spin_lock_irq(&mdev->req_lock);
429
430         _tl_restart(mdev, connection_lost_while_pending);
431
432         /* we expect this list to be empty. */
433         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
434
435         /* but just in case, clean it up anyways! */
436         list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
437                 r = list_entry(le, struct drbd_request, tl_requests);
438                 /* It would be nice to complete outside of spinlock.
439                  * But this is easier for now. */
440                 _req_mod(r, connection_lost_while_pending);
441         }
442
443         /* ensure bit indicating barrier is required is clear */
444         clear_bit(CREATE_BARRIER, &mdev->flags);
445
446         memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
447
448         spin_unlock_irq(&mdev->req_lock);
449 }
450
451 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
452 {
453         spin_lock_irq(&mdev->req_lock);
454         _tl_restart(mdev, what);
455         spin_unlock_irq(&mdev->req_lock);
456 }
457
458 /**
459  * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
460  * @mdev:       DRBD device.
461  * @os:         old (current) state.
462  * @ns:         new (wanted) state.
463  */
464 static int cl_wide_st_chg(struct drbd_conf *mdev,
465                           union drbd_state os, union drbd_state ns)
466 {
467         return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
468                  ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
469                   (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
470                   (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
471                   (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
472                 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
473                 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
474 }
475
476 int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
477                       union drbd_state mask, union drbd_state val)
478 {
479         unsigned long flags;
480         union drbd_state os, ns;
481         int rv;
482
483         spin_lock_irqsave(&mdev->req_lock, flags);
484         os = mdev->state;
485         ns.i = (os.i & ~mask.i) | val.i;
486         rv = _drbd_set_state(mdev, ns, f, NULL);
487         ns = mdev->state;
488         spin_unlock_irqrestore(&mdev->req_lock, flags);
489
490         return rv;
491 }
492
493 /**
494  * drbd_force_state() - Impose a change which happens outside our control on our state
495  * @mdev:       DRBD device.
496  * @mask:       mask of state bits to change.
497  * @val:        value of new state bits.
498  */
499 void drbd_force_state(struct drbd_conf *mdev,
500         union drbd_state mask, union drbd_state val)
501 {
502         drbd_change_state(mdev, CS_HARD, mask, val);
503 }
504
505 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
506 static int is_valid_state_transition(struct drbd_conf *,
507                                      union drbd_state, union drbd_state);
508 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
509                                        union drbd_state ns, const char **warn_sync_abort);
510 int drbd_send_state_req(struct drbd_conf *,
511                         union drbd_state, union drbd_state);
512
513 static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
514                                     union drbd_state mask, union drbd_state val)
515 {
516         union drbd_state os, ns;
517         unsigned long flags;
518         int rv;
519
520         if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
521                 return SS_CW_SUCCESS;
522
523         if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
524                 return SS_CW_FAILED_BY_PEER;
525
526         rv = 0;
527         spin_lock_irqsave(&mdev->req_lock, flags);
528         os = mdev->state;
529         ns.i = (os.i & ~mask.i) | val.i;
530         ns = sanitize_state(mdev, os, ns, NULL);
531
532         if (!cl_wide_st_chg(mdev, os, ns))
533                 rv = SS_CW_NO_NEED;
534         if (!rv) {
535                 rv = is_valid_state(mdev, ns);
536                 if (rv == SS_SUCCESS) {
537                         rv = is_valid_state_transition(mdev, ns, os);
538                         if (rv == SS_SUCCESS)
539                                 rv = 0; /* cont waiting, otherwise fail. */
540                 }
541         }
542         spin_unlock_irqrestore(&mdev->req_lock, flags);
543
544         return rv;
545 }
546
547 /**
548  * drbd_req_state() - Perform an eventually cluster wide state change
549  * @mdev:       DRBD device.
550  * @mask:       mask of state bits to change.
551  * @val:        value of new state bits.
552  * @f:          flags
553  *
554  * Should not be called directly, use drbd_request_state() or
555  * _drbd_request_state().
556  */
557 static int drbd_req_state(struct drbd_conf *mdev,
558                           union drbd_state mask, union drbd_state val,
559                           enum chg_state_flags f)
560 {
561         struct completion done;
562         unsigned long flags;
563         union drbd_state os, ns;
564         int rv;
565
566         init_completion(&done);
567
568         if (f & CS_SERIALIZE)
569                 mutex_lock(&mdev->state_mutex);
570
571         spin_lock_irqsave(&mdev->req_lock, flags);
572         os = mdev->state;
573         ns.i = (os.i & ~mask.i) | val.i;
574         ns = sanitize_state(mdev, os, ns, NULL);
575
576         if (cl_wide_st_chg(mdev, os, ns)) {
577                 rv = is_valid_state(mdev, ns);
578                 if (rv == SS_SUCCESS)
579                         rv = is_valid_state_transition(mdev, ns, os);
580                 spin_unlock_irqrestore(&mdev->req_lock, flags);
581
582                 if (rv < SS_SUCCESS) {
583                         if (f & CS_VERBOSE)
584                                 print_st_err(mdev, os, ns, rv);
585                         goto abort;
586                 }
587
588                 drbd_state_lock(mdev);
589                 if (!drbd_send_state_req(mdev, mask, val)) {
590                         drbd_state_unlock(mdev);
591                         rv = SS_CW_FAILED_BY_PEER;
592                         if (f & CS_VERBOSE)
593                                 print_st_err(mdev, os, ns, rv);
594                         goto abort;
595                 }
596
597                 wait_event(mdev->state_wait,
598                         (rv = _req_st_cond(mdev, mask, val)));
599
600                 if (rv < SS_SUCCESS) {
601                         drbd_state_unlock(mdev);
602                         if (f & CS_VERBOSE)
603                                 print_st_err(mdev, os, ns, rv);
604                         goto abort;
605                 }
606                 spin_lock_irqsave(&mdev->req_lock, flags);
607                 os = mdev->state;
608                 ns.i = (os.i & ~mask.i) | val.i;
609                 rv = _drbd_set_state(mdev, ns, f, &done);
610                 drbd_state_unlock(mdev);
611         } else {
612                 rv = _drbd_set_state(mdev, ns, f, &done);
613         }
614
615         spin_unlock_irqrestore(&mdev->req_lock, flags);
616
617         if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
618                 D_ASSERT(current != mdev->worker.task);
619                 wait_for_completion(&done);
620         }
621
622 abort:
623         if (f & CS_SERIALIZE)
624                 mutex_unlock(&mdev->state_mutex);
625
626         return rv;
627 }
628
629 /**
630  * _drbd_request_state() - Request a state change (with flags)
631  * @mdev:       DRBD device.
632  * @mask:       mask of state bits to change.
633  * @val:        value of new state bits.
634  * @f:          flags
635  *
636  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
637  * flag, or when logging of failed state change requests is not desired.
638  */
639 int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
640                         union drbd_state val,   enum chg_state_flags f)
641 {
642         int rv;
643
644         wait_event(mdev->state_wait,
645                    (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
646
647         return rv;
648 }
649
650 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
651 {
652         dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
653             name,
654             drbd_conn_str(ns.conn),
655             drbd_role_str(ns.role),
656             drbd_role_str(ns.peer),
657             drbd_disk_str(ns.disk),
658             drbd_disk_str(ns.pdsk),
659             is_susp(ns) ? 's' : 'r',
660             ns.aftr_isp ? 'a' : '-',
661             ns.peer_isp ? 'p' : '-',
662             ns.user_isp ? 'u' : '-'
663             );
664 }
665
666 void print_st_err(struct drbd_conf *mdev,
667         union drbd_state os, union drbd_state ns, int err)
668 {
669         if (err == SS_IN_TRANSIENT_STATE)
670                 return;
671         dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
672         print_st(mdev, " state", os);
673         print_st(mdev, "wanted", ns);
674 }
675
676
677 #define drbd_peer_str drbd_role_str
678 #define drbd_pdsk_str drbd_disk_str
679
680 #define drbd_susp_str(A)     ((A) ? "1" : "0")
681 #define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
682 #define drbd_peer_isp_str(A) ((A) ? "1" : "0")
683 #define drbd_user_isp_str(A) ((A) ? "1" : "0")
684
685 #define PSC(A) \
686         ({ if (ns.A != os.A) { \
687                 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
688                               drbd_##A##_str(os.A), \
689                               drbd_##A##_str(ns.A)); \
690         } })
691
692 /**
693  * is_valid_state() - Returns an SS_ error code if ns is not valid
694  * @mdev:       DRBD device.
695  * @ns:         State to consider.
696  */
697 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
698 {
699         /* See drbd_state_sw_errors in drbd_strings.c */
700
701         enum drbd_fencing_p fp;
702         int rv = SS_SUCCESS;
703
704         fp = FP_DONT_CARE;
705         if (get_ldev(mdev)) {
706                 fp = mdev->ldev->dc.fencing;
707                 put_ldev(mdev);
708         }
709
710         if (get_net_conf(mdev)) {
711                 if (!mdev->net_conf->two_primaries &&
712                     ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
713                         rv = SS_TWO_PRIMARIES;
714                 put_net_conf(mdev);
715         }
716
717         if (rv <= 0)
718                 /* already found a reason to abort */;
719         else if (ns.role == R_SECONDARY && mdev->open_cnt)
720                 rv = SS_DEVICE_IN_USE;
721
722         else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
723                 rv = SS_NO_UP_TO_DATE_DISK;
724
725         else if (fp >= FP_RESOURCE &&
726                  ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
727                 rv = SS_PRIMARY_NOP;
728
729         else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
730                 rv = SS_NO_UP_TO_DATE_DISK;
731
732         else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
733                 rv = SS_NO_LOCAL_DISK;
734
735         else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
736                 rv = SS_NO_REMOTE_DISK;
737
738         else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
739                 rv = SS_NO_UP_TO_DATE_DISK;
740
741         else if ((ns.conn == C_CONNECTED ||
742                   ns.conn == C_WF_BITMAP_S ||
743                   ns.conn == C_SYNC_SOURCE ||
744                   ns.conn == C_PAUSED_SYNC_S) &&
745                   ns.disk == D_OUTDATED)
746                 rv = SS_CONNECTED_OUTDATES;
747
748         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
749                  (mdev->sync_conf.verify_alg[0] == 0))
750                 rv = SS_NO_VERIFY_ALG;
751
752         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
753                   mdev->agreed_pro_version < 88)
754                 rv = SS_NOT_SUPPORTED;
755
756         return rv;
757 }
758
759 /**
760  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
761  * @mdev:       DRBD device.
762  * @ns:         new state.
763  * @os:         old state.
764  */
765 static int is_valid_state_transition(struct drbd_conf *mdev,
766                                      union drbd_state ns, union drbd_state os)
767 {
768         int rv = SS_SUCCESS;
769
770         if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
771             os.conn > C_CONNECTED)
772                 rv = SS_RESYNC_RUNNING;
773
774         if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
775                 rv = SS_ALREADY_STANDALONE;
776
777         if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
778                 rv = SS_IS_DISKLESS;
779
780         if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
781                 rv = SS_NO_NET_CONFIG;
782
783         if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
784                 rv = SS_LOWER_THAN_OUTDATED;
785
786         if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
787                 rv = SS_IN_TRANSIENT_STATE;
788
789         if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
790                 rv = SS_IN_TRANSIENT_STATE;
791
792         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
793                 rv = SS_NEED_CONNECTION;
794
795         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
796             ns.conn != os.conn && os.conn > C_CONNECTED)
797                 rv = SS_RESYNC_RUNNING;
798
799         if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
800             os.conn < C_CONNECTED)
801                 rv = SS_NEED_CONNECTION;
802
803         if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
804             && os.conn < C_WF_REPORT_PARAMS)
805                 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
806
807         return rv;
808 }
809
810 /**
811  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
812  * @mdev:       DRBD device.
813  * @os:         old state.
814  * @ns:         new state.
815  * @warn_sync_abort:
816  *
817  * When we lose the connection, we have to set the state of the peer's disk (pdsk)
818  * to D_UNKNOWN. This rule and many more along those lines are in this function.
819  */
820 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
821                                        union drbd_state ns, const char **warn_sync_abort)
822 {
823         enum drbd_fencing_p fp;
824         enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
825
826         fp = FP_DONT_CARE;
827         if (get_ldev(mdev)) {
828                 fp = mdev->ldev->dc.fencing;
829                 put_ldev(mdev);
830         }
831
832         /* Disallow Network errors to configure a device's network part */
833         if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
834             os.conn <= C_DISCONNECTING)
835                 ns.conn = os.conn;
836
837         /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
838          * If you try to go into some Sync* state, that shall fail (elsewhere). */
839         if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
840             ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
841                 ns.conn = os.conn;
842
843         /* we cannot fail (again) if we already detached */
844         if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
845                 ns.disk = D_DISKLESS;
846
847         /* if we are only D_ATTACHING yet,
848          * we can (and should) go directly to D_DISKLESS. */
849         if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
850                 ns.disk = D_DISKLESS;
851
852         /* After C_DISCONNECTING only C_STANDALONE may follow */
853         if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
854                 ns.conn = os.conn;
855
856         if (ns.conn < C_CONNECTED) {
857                 ns.peer_isp = 0;
858                 ns.peer = R_UNKNOWN;
859                 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
860                         ns.pdsk = D_UNKNOWN;
861         }
862
863         /* Clear the aftr_isp when becoming unconfigured */
864         if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
865                 ns.aftr_isp = 0;
866
867         /* Abort resync if a disk fails/detaches */
868         if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
869             (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
870                 if (warn_sync_abort)
871                         *warn_sync_abort =
872                                 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
873                                 "Online-verify" : "Resync";
874                 ns.conn = C_CONNECTED;
875         }
876
877         /* Connection breaks down before we finished "Negotiating" */
878         if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
879             get_ldev_if_state(mdev, D_NEGOTIATING)) {
880                 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
881                         ns.disk = mdev->new_state_tmp.disk;
882                         ns.pdsk = mdev->new_state_tmp.pdsk;
883                 } else {
884                         dev_alert(DEV, "Connection lost while negotiating, no data!\n");
885                         ns.disk = D_DISKLESS;
886                         ns.pdsk = D_UNKNOWN;
887                 }
888                 put_ldev(mdev);
889         }
890
891         /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
892         if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
893                 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
894                         ns.disk = D_UP_TO_DATE;
895                 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
896                         ns.pdsk = D_UP_TO_DATE;
897         }
898
899         /* Implications of the connection stat on the disk states */
900         disk_min = D_DISKLESS;
901         disk_max = D_UP_TO_DATE;
902         pdsk_min = D_INCONSISTENT;
903         pdsk_max = D_UNKNOWN;
904         switch ((enum drbd_conns)ns.conn) {
905         case C_WF_BITMAP_T:
906         case C_PAUSED_SYNC_T:
907         case C_STARTING_SYNC_T:
908         case C_WF_SYNC_UUID:
909         case C_BEHIND:
910                 disk_min = D_INCONSISTENT;
911                 disk_max = D_OUTDATED;
912                 pdsk_min = D_UP_TO_DATE;
913                 pdsk_max = D_UP_TO_DATE;
914                 break;
915         case C_VERIFY_S:
916         case C_VERIFY_T:
917                 disk_min = D_UP_TO_DATE;
918                 disk_max = D_UP_TO_DATE;
919                 pdsk_min = D_UP_TO_DATE;
920                 pdsk_max = D_UP_TO_DATE;
921                 break;
922         case C_CONNECTED:
923                 disk_min = D_DISKLESS;
924                 disk_max = D_UP_TO_DATE;
925                 pdsk_min = D_DISKLESS;
926                 pdsk_max = D_UP_TO_DATE;
927                 break;
928         case C_WF_BITMAP_S:
929         case C_PAUSED_SYNC_S:
930         case C_STARTING_SYNC_S:
931         case C_AHEAD:
932                 disk_min = D_UP_TO_DATE;
933                 disk_max = D_UP_TO_DATE;
934                 pdsk_min = D_INCONSISTENT;
935                 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
936                 break;
937         case C_SYNC_TARGET:
938                 disk_min = D_INCONSISTENT;
939                 disk_max = D_INCONSISTENT;
940                 pdsk_min = D_UP_TO_DATE;
941                 pdsk_max = D_UP_TO_DATE;
942                 break;
943         case C_SYNC_SOURCE:
944                 disk_min = D_UP_TO_DATE;
945                 disk_max = D_UP_TO_DATE;
946                 pdsk_min = D_INCONSISTENT;
947                 pdsk_max = D_INCONSISTENT;
948                 break;
949         case C_STANDALONE:
950         case C_DISCONNECTING:
951         case C_UNCONNECTED:
952         case C_TIMEOUT:
953         case C_BROKEN_PIPE:
954         case C_NETWORK_FAILURE:
955         case C_PROTOCOL_ERROR:
956         case C_TEAR_DOWN:
957         case C_WF_CONNECTION:
958         case C_WF_REPORT_PARAMS:
959         case C_MASK:
960                 break;
961         }
962         if (ns.disk > disk_max)
963                 ns.disk = disk_max;
964
965         if (ns.disk < disk_min) {
966                 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
967                          drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
968                 ns.disk = disk_min;
969         }
970         if (ns.pdsk > pdsk_max)
971                 ns.pdsk = pdsk_max;
972
973         if (ns.pdsk < pdsk_min) {
974                 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
975                          drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
976                 ns.pdsk = pdsk_min;
977         }
978
979         if (fp == FP_STONITH &&
980             (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
981             !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
982                 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
983
984         if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
985             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
986             !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
987                 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
988
989         if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
990                 if (ns.conn == C_SYNC_SOURCE)
991                         ns.conn = C_PAUSED_SYNC_S;
992                 if (ns.conn == C_SYNC_TARGET)
993                         ns.conn = C_PAUSED_SYNC_T;
994         } else {
995                 if (ns.conn == C_PAUSED_SYNC_S)
996                         ns.conn = C_SYNC_SOURCE;
997                 if (ns.conn == C_PAUSED_SYNC_T)
998                         ns.conn = C_SYNC_TARGET;
999         }
1000
1001         return ns;
1002 }
1003
1004 /* helper for __drbd_set_state */
1005 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1006 {
1007         if (mdev->agreed_pro_version < 90)
1008                 mdev->ov_start_sector = 0;
1009         mdev->rs_total = drbd_bm_bits(mdev);
1010         mdev->ov_position = 0;
1011         if (cs == C_VERIFY_T) {
1012                 /* starting online verify from an arbitrary position
1013                  * does not fit well into the existing protocol.
1014                  * on C_VERIFY_T, we initialize ov_left and friends
1015                  * implicitly in receive_DataRequest once the
1016                  * first P_OV_REQUEST is received */
1017                 mdev->ov_start_sector = ~(sector_t)0;
1018         } else {
1019                 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1020                 if (bit >= mdev->rs_total) {
1021                         mdev->ov_start_sector =
1022                                 BM_BIT_TO_SECT(mdev->rs_total - 1);
1023                         mdev->rs_total = 1;
1024                 } else
1025                         mdev->rs_total -= bit;
1026                 mdev->ov_position = mdev->ov_start_sector;
1027         }
1028         mdev->ov_left = mdev->rs_total;
1029 }
1030
1031 static void drbd_resume_al(struct drbd_conf *mdev)
1032 {
1033         if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1034                 dev_info(DEV, "Resumed AL updates\n");
1035 }
1036
1037 /**
1038  * __drbd_set_state() - Set a new DRBD state
1039  * @mdev:       DRBD device.
1040  * @ns:         new state.
1041  * @flags:      Flags
1042  * @done:       Optional completion, that will get completed after the after_state_ch() finished
1043  *
1044  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1045  */
1046 int __drbd_set_state(struct drbd_conf *mdev,
1047                     union drbd_state ns, enum chg_state_flags flags,
1048                     struct completion *done)
1049 {
1050         union drbd_state os;
1051         int rv = SS_SUCCESS;
1052         const char *warn_sync_abort = NULL;
1053         struct after_state_chg_work *ascw;
1054
1055         os = mdev->state;
1056
1057         ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1058
1059         if (ns.i == os.i)
1060                 return SS_NOTHING_TO_DO;
1061
1062         if (!(flags & CS_HARD)) {
1063                 /*  pre-state-change checks ; only look at ns  */
1064                 /* See drbd_state_sw_errors in drbd_strings.c */
1065
1066                 rv = is_valid_state(mdev, ns);
1067                 if (rv < SS_SUCCESS) {
1068                         /* If the old state was illegal as well, then let
1069                            this happen...*/
1070
1071                         if (is_valid_state(mdev, os) == rv)
1072                                 rv = is_valid_state_transition(mdev, ns, os);
1073                 } else
1074                         rv = is_valid_state_transition(mdev, ns, os);
1075         }
1076
1077         if (rv < SS_SUCCESS) {
1078                 if (flags & CS_VERBOSE)
1079                         print_st_err(mdev, os, ns, rv);
1080                 return rv;
1081         }
1082
1083         if (warn_sync_abort)
1084                 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1085
1086         {
1087                 char *pbp, pb[300];
1088                 pbp = pb;
1089                 *pbp = 0;
1090                 PSC(role);
1091                 PSC(peer);
1092                 PSC(conn);
1093                 PSC(disk);
1094                 PSC(pdsk);
1095                 if (is_susp(ns) != is_susp(os))
1096                         pbp += sprintf(pbp, "susp( %s -> %s ) ",
1097                                        drbd_susp_str(is_susp(os)),
1098                                        drbd_susp_str(is_susp(ns)));
1099                 PSC(aftr_isp);
1100                 PSC(peer_isp);
1101                 PSC(user_isp);
1102                 dev_info(DEV, "%s\n", pb);
1103         }
1104
1105         /* solve the race between becoming unconfigured,
1106          * worker doing the cleanup, and
1107          * admin reconfiguring us:
1108          * on (re)configure, first set CONFIG_PENDING,
1109          * then wait for a potentially exiting worker,
1110          * start the worker, and schedule one no_op.
1111          * then proceed with configuration.
1112          */
1113         if (ns.disk == D_DISKLESS &&
1114             ns.conn == C_STANDALONE &&
1115             ns.role == R_SECONDARY &&
1116             !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1117                 set_bit(DEVICE_DYING, &mdev->flags);
1118
1119         /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1120          * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1121          * drbd_ldev_destroy() won't happen before our corresponding
1122          * after_state_ch works run, where we put_ldev again. */
1123         if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1124             (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1125                 atomic_inc(&mdev->local_cnt);
1126
1127         mdev->state = ns;
1128         wake_up(&mdev->misc_wait);
1129         wake_up(&mdev->state_wait);
1130
1131         /* aborted verify run. log the last position */
1132         if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1133             ns.conn < C_CONNECTED) {
1134                 mdev->ov_start_sector =
1135                         BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1136                 dev_info(DEV, "Online Verify reached sector %llu\n",
1137                         (unsigned long long)mdev->ov_start_sector);
1138         }
1139
1140         if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1141             (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1142                 dev_info(DEV, "Syncer continues.\n");
1143                 mdev->rs_paused += (long)jiffies
1144                                   -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1145                 if (ns.conn == C_SYNC_TARGET)
1146                         mod_timer(&mdev->resync_timer, jiffies);
1147         }
1148
1149         if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1150             (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1151                 dev_info(DEV, "Resync suspended\n");
1152                 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1153         }
1154
1155         if (os.conn == C_CONNECTED &&
1156             (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1157                 unsigned long now = jiffies;
1158                 int i;
1159
1160                 set_ov_position(mdev, ns.conn);
1161                 mdev->rs_start = now;
1162                 mdev->rs_last_events = 0;
1163                 mdev->rs_last_sect_ev = 0;
1164                 mdev->ov_last_oos_size = 0;
1165                 mdev->ov_last_oos_start = 0;
1166
1167                 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1168                         mdev->rs_mark_left[i] = mdev->ov_left;
1169                         mdev->rs_mark_time[i] = now;
1170                 }
1171
1172                 drbd_rs_controller_reset(mdev);
1173
1174                 if (ns.conn == C_VERIFY_S) {
1175                         dev_info(DEV, "Starting Online Verify from sector %llu\n",
1176                                         (unsigned long long)mdev->ov_position);
1177                         mod_timer(&mdev->resync_timer, jiffies);
1178                 }
1179         }
1180
1181         if (get_ldev(mdev)) {
1182                 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1183                                                  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1184                                                  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1185
1186                 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1187                         mdf |= MDF_CRASHED_PRIMARY;
1188                 if (mdev->state.role == R_PRIMARY ||
1189                     (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1190                         mdf |= MDF_PRIMARY_IND;
1191                 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1192                         mdf |= MDF_CONNECTED_IND;
1193                 if (mdev->state.disk > D_INCONSISTENT)
1194                         mdf |= MDF_CONSISTENT;
1195                 if (mdev->state.disk > D_OUTDATED)
1196                         mdf |= MDF_WAS_UP_TO_DATE;
1197                 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1198                         mdf |= MDF_PEER_OUT_DATED;
1199                 if (mdf != mdev->ldev->md.flags) {
1200                         mdev->ldev->md.flags = mdf;
1201                         drbd_md_mark_dirty(mdev);
1202                 }
1203                 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1204                         drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1205                 put_ldev(mdev);
1206         }
1207
1208         /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1209         if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1210             os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1211                 set_bit(CONSIDER_RESYNC, &mdev->flags);
1212
1213         /* Receiver should clean up itself */
1214         if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1215                 drbd_thread_stop_nowait(&mdev->receiver);
1216
1217         /* Now the receiver finished cleaning up itself, it should die */
1218         if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1219                 drbd_thread_stop_nowait(&mdev->receiver);
1220
1221         /* Upon network failure, we need to restart the receiver. */
1222         if (os.conn > C_TEAR_DOWN &&
1223             ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1224                 drbd_thread_restart_nowait(&mdev->receiver);
1225
1226         /* Resume AL writing if we get a connection */
1227         if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1228                 drbd_resume_al(mdev);
1229
1230         ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1231         if (ascw) {
1232                 ascw->os = os;
1233                 ascw->ns = ns;
1234                 ascw->flags = flags;
1235                 ascw->w.cb = w_after_state_ch;
1236                 ascw->done = done;
1237                 drbd_queue_work(&mdev->data.work, &ascw->w);
1238         } else {
1239                 dev_warn(DEV, "Could not kmalloc an ascw\n");
1240         }
1241
1242         return rv;
1243 }
1244
1245 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1246 {
1247         struct after_state_chg_work *ascw =
1248                 container_of(w, struct after_state_chg_work, w);
1249         after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1250         if (ascw->flags & CS_WAIT_COMPLETE) {
1251                 D_ASSERT(ascw->done != NULL);
1252                 complete(ascw->done);
1253         }
1254         kfree(ascw);
1255
1256         return 1;
1257 }
1258
1259 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1260 {
1261         if (rv) {
1262                 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1263                 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1264                 return;
1265         }
1266
1267         switch (mdev->state.conn) {
1268         case C_STARTING_SYNC_T:
1269                 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1270                 break;
1271         case C_STARTING_SYNC_S:
1272                 drbd_start_resync(mdev, C_SYNC_SOURCE);
1273                 break;
1274         }
1275 }
1276
1277 /**
1278  * after_state_ch() - Perform after state change actions that may sleep
1279  * @mdev:       DRBD device.
1280  * @os:         old state.
1281  * @ns:         new state.
1282  * @flags:      Flags
1283  */
1284 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1285                            union drbd_state ns, enum chg_state_flags flags)
1286 {
1287         enum drbd_fencing_p fp;
1288         enum drbd_req_event what = nothing;
1289         union drbd_state nsm = (union drbd_state){ .i = -1 };
1290
1291         if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1292                 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1293                 if (mdev->p_uuid)
1294                         mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1295         }
1296
1297         fp = FP_DONT_CARE;
1298         if (get_ldev(mdev)) {
1299                 fp = mdev->ldev->dc.fencing;
1300                 put_ldev(mdev);
1301         }
1302
1303         /* Inform userspace about the change... */
1304         drbd_bcast_state(mdev, ns);
1305
1306         if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1307             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1308                 drbd_khelper(mdev, "pri-on-incon-degr");
1309
1310         /* Here we have the actions that are performed after a
1311            state change. This function might sleep */
1312
1313         nsm.i = -1;
1314         if (ns.susp_nod) {
1315                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1316                         if (ns.conn == C_CONNECTED)
1317                                 what = resend, nsm.susp_nod = 0;
1318                         else /* ns.conn > C_CONNECTED */
1319                                 dev_err(DEV, "Unexpected Resynd going on!\n");
1320                 }
1321
1322                 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1323                         what = restart_frozen_disk_io, nsm.susp_nod = 0;
1324
1325         }
1326
1327         if (ns.susp_fen) {
1328                 /* case1: The outdate peer handler is successful: */
1329                 if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1330                         tl_clear(mdev);
1331                         if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1332                                 drbd_uuid_new_current(mdev);
1333                                 clear_bit(NEW_CUR_UUID, &mdev->flags);
1334                         }
1335                         spin_lock_irq(&mdev->req_lock);
1336                         _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1337                         spin_unlock_irq(&mdev->req_lock);
1338                 }
1339                 /* case2: The connection was established again: */
1340                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1341                         clear_bit(NEW_CUR_UUID, &mdev->flags);
1342                         what = resend;
1343                         nsm.susp_fen = 0;
1344                 }
1345         }
1346
1347         if (what != nothing) {
1348                 spin_lock_irq(&mdev->req_lock);
1349                 _tl_restart(mdev, what);
1350                 nsm.i &= mdev->state.i;
1351                 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1352                 spin_unlock_irq(&mdev->req_lock);
1353         }
1354
1355         /* Do not change the order of the if above and the two below... */
1356         if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1357                 drbd_send_uuids(mdev);
1358                 drbd_send_state(mdev);
1359         }
1360         if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1361                 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1362
1363         /* Lost contact to peer's copy of the data */
1364         if ((os.pdsk >= D_INCONSISTENT &&
1365              os.pdsk != D_UNKNOWN &&
1366              os.pdsk != D_OUTDATED)
1367         &&  (ns.pdsk < D_INCONSISTENT ||
1368              ns.pdsk == D_UNKNOWN ||
1369              ns.pdsk == D_OUTDATED)) {
1370                 if (get_ldev(mdev)) {
1371                         if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1372                             mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1373                                 if (is_susp(mdev->state)) {
1374                                         set_bit(NEW_CUR_UUID, &mdev->flags);
1375                                 } else {
1376                                         drbd_uuid_new_current(mdev);
1377                                         drbd_send_uuids(mdev);
1378                                 }
1379                         }
1380                         put_ldev(mdev);
1381                 }
1382         }
1383
1384         if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1385                 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1386                         drbd_uuid_new_current(mdev);
1387                         drbd_send_uuids(mdev);
1388                 }
1389
1390                 /* D_DISKLESS Peer becomes secondary */
1391                 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1392                         drbd_al_to_on_disk_bm(mdev);
1393                 put_ldev(mdev);
1394         }
1395
1396         /* Last part of the attaching process ... */
1397         if (ns.conn >= C_CONNECTED &&
1398             os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1399                 drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1400                 drbd_send_uuids(mdev);
1401                 drbd_send_state(mdev);
1402         }
1403
1404         /* We want to pause/continue resync, tell peer. */
1405         if (ns.conn >= C_CONNECTED &&
1406              ((os.aftr_isp != ns.aftr_isp) ||
1407               (os.user_isp != ns.user_isp)))
1408                 drbd_send_state(mdev);
1409
1410         /* In case one of the isp bits got set, suspend other devices. */
1411         if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1412             (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1413                 suspend_other_sg(mdev);
1414
1415         /* Make sure the peer gets informed about eventual state
1416            changes (ISP bits) while we were in WFReportParams. */
1417         if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1418                 drbd_send_state(mdev);
1419
1420         if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1421                 drbd_send_state(mdev);
1422
1423         /* We are in the progress to start a full sync... */
1424         if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1425             (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1426                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1427
1428         /* We are invalidating our self... */
1429         if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1430             os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1431                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1432
1433         /* first half of local IO error, failure to attach,
1434          * or administrative detach */
1435         if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1436                 enum drbd_io_error_p eh;
1437                 int was_io_error;
1438                 /* corresponding get_ldev was in __drbd_set_state, to serialize
1439                  * our cleanup here with the transition to D_DISKLESS,
1440                  * so it is safe to dreference ldev here. */
1441                 eh = mdev->ldev->dc.on_io_error;
1442                 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1443
1444                 /* current state still has to be D_FAILED,
1445                  * there is only one way out: to D_DISKLESS,
1446                  * and that may only happen after our put_ldev below. */
1447                 if (mdev->state.disk != D_FAILED)
1448                         dev_err(DEV,
1449                                 "ASSERT FAILED: disk is %s during detach\n",
1450                                 drbd_disk_str(mdev->state.disk));
1451
1452                 if (drbd_send_state(mdev))
1453                         dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1454                 else
1455                         dev_err(DEV, "Sending state for detaching disk failed\n");
1456
1457                 drbd_rs_cancel_all(mdev);
1458
1459                 /* In case we want to get something to stable storage still,
1460                  * this may be the last chance.
1461                  * Following put_ldev may transition to D_DISKLESS. */
1462                 drbd_md_sync(mdev);
1463                 put_ldev(mdev);
1464
1465                 if (was_io_error && eh == EP_CALL_HELPER)
1466                         drbd_khelper(mdev, "local-io-error");
1467         }
1468
1469         /* second half of local IO error, failure to attach,
1470          * or administrative detach,
1471          * after local_cnt references have reached zero again */
1472         if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1473                 /* We must still be diskless,
1474                  * re-attach has to be serialized with this! */
1475                 if (mdev->state.disk != D_DISKLESS)
1476                         dev_err(DEV,
1477                                 "ASSERT FAILED: disk is %s while going diskless\n",
1478                                 drbd_disk_str(mdev->state.disk));
1479
1480                 mdev->rs_total = 0;
1481                 mdev->rs_failed = 0;
1482                 atomic_set(&mdev->rs_pending_cnt, 0);
1483
1484                 if (drbd_send_state(mdev))
1485                         dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1486                 else
1487                         dev_err(DEV, "Sending state for being diskless failed\n");
1488                 /* corresponding get_ldev in __drbd_set_state
1489                  * this may finaly trigger drbd_ldev_destroy. */
1490                 put_ldev(mdev);
1491         }
1492
1493         /* Disks got bigger while they were detached */
1494         if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1495             test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1496                 if (ns.conn == C_CONNECTED)
1497                         resync_after_online_grow(mdev);
1498         }
1499
1500         /* A resync finished or aborted, wake paused devices... */
1501         if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1502             (os.peer_isp && !ns.peer_isp) ||
1503             (os.user_isp && !ns.user_isp))
1504                 resume_next_sg(mdev);
1505
1506         /* sync target done with resync.  Explicitly notify peer, even though
1507          * it should (at least for non-empty resyncs) already know itself. */
1508         if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1509                 drbd_send_state(mdev);
1510
1511         /* free tl_hash if we Got thawed and are C_STANDALONE */
1512         if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1513                 drbd_free_tl_hash(mdev);
1514
1515         /* Upon network connection, we need to start the receiver */
1516         if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1517                 drbd_thread_start(&mdev->receiver);
1518
1519         /* Terminate worker thread if we are unconfigured - it will be
1520            restarted as needed... */
1521         if (ns.disk == D_DISKLESS &&
1522             ns.conn == C_STANDALONE &&
1523             ns.role == R_SECONDARY) {
1524                 if (os.aftr_isp != ns.aftr_isp)
1525                         resume_next_sg(mdev);
1526                 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1527                 if (test_bit(DEVICE_DYING, &mdev->flags))
1528                         drbd_thread_stop_nowait(&mdev->worker);
1529         }
1530
1531         drbd_md_sync(mdev);
1532 }
1533
1534
1535 static int drbd_thread_setup(void *arg)
1536 {
1537         struct drbd_thread *thi = (struct drbd_thread *) arg;
1538         struct drbd_conf *mdev = thi->mdev;
1539         unsigned long flags;
1540         int retval;
1541
1542 restart:
1543         retval = thi->function(thi);
1544
1545         spin_lock_irqsave(&thi->t_lock, flags);
1546
1547         /* if the receiver has been "Exiting", the last thing it did
1548          * was set the conn state to "StandAlone",
1549          * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1550          * and receiver thread will be "started".
1551          * drbd_thread_start needs to set "Restarting" in that case.
1552          * t_state check and assignment needs to be within the same spinlock,
1553          * so either thread_start sees Exiting, and can remap to Restarting,
1554          * or thread_start see None, and can proceed as normal.
1555          */
1556
1557         if (thi->t_state == Restarting) {
1558                 dev_info(DEV, "Restarting %s\n", current->comm);
1559                 thi->t_state = Running;
1560                 spin_unlock_irqrestore(&thi->t_lock, flags);
1561                 goto restart;
1562         }
1563
1564         thi->task = NULL;
1565         thi->t_state = None;
1566         smp_mb();
1567         complete(&thi->stop);
1568         spin_unlock_irqrestore(&thi->t_lock, flags);
1569
1570         dev_info(DEV, "Terminating %s\n", current->comm);
1571
1572         /* Release mod reference taken when thread was started */
1573         module_put(THIS_MODULE);
1574         return retval;
1575 }
1576
1577 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1578                       int (*func) (struct drbd_thread *))
1579 {
1580         spin_lock_init(&thi->t_lock);
1581         thi->task    = NULL;
1582         thi->t_state = None;
1583         thi->function = func;
1584         thi->mdev = mdev;
1585 }
1586
1587 int drbd_thread_start(struct drbd_thread *thi)
1588 {
1589         struct drbd_conf *mdev = thi->mdev;
1590         struct task_struct *nt;
1591         unsigned long flags;
1592
1593         const char *me =
1594                 thi == &mdev->receiver ? "receiver" :
1595                 thi == &mdev->asender  ? "asender"  :
1596                 thi == &mdev->worker   ? "worker"   : "NONSENSE";
1597
1598         /* is used from state engine doing drbd_thread_stop_nowait,
1599          * while holding the req lock irqsave */
1600         spin_lock_irqsave(&thi->t_lock, flags);
1601
1602         switch (thi->t_state) {
1603         case None:
1604                 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1605                                 me, current->comm, current->pid);
1606
1607                 /* Get ref on module for thread - this is released when thread exits */
1608                 if (!try_module_get(THIS_MODULE)) {
1609                         dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1610                         spin_unlock_irqrestore(&thi->t_lock, flags);
1611                         return FALSE;
1612                 }
1613
1614                 init_completion(&thi->stop);
1615                 D_ASSERT(thi->task == NULL);
1616                 thi->reset_cpu_mask = 1;
1617                 thi->t_state = Running;
1618                 spin_unlock_irqrestore(&thi->t_lock, flags);
1619                 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1620
1621                 nt = kthread_create(drbd_thread_setup, (void *) thi,
1622                                     "drbd%d_%s", mdev_to_minor(mdev), me);
1623
1624                 if (IS_ERR(nt)) {
1625                         dev_err(DEV, "Couldn't start thread\n");
1626
1627                         module_put(THIS_MODULE);
1628                         return FALSE;
1629                 }
1630                 spin_lock_irqsave(&thi->t_lock, flags);
1631                 thi->task = nt;
1632                 thi->t_state = Running;
1633                 spin_unlock_irqrestore(&thi->t_lock, flags);
1634                 wake_up_process(nt);
1635                 break;
1636         case Exiting:
1637                 thi->t_state = Restarting;
1638                 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1639                                 me, current->comm, current->pid);
1640                 /* fall through */
1641         case Running:
1642         case Restarting:
1643         default:
1644                 spin_unlock_irqrestore(&thi->t_lock, flags);
1645                 break;
1646         }
1647
1648         return TRUE;
1649 }
1650
1651
1652 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1653 {
1654         unsigned long flags;
1655
1656         enum drbd_thread_state ns = restart ? Restarting : Exiting;
1657
1658         /* may be called from state engine, holding the req lock irqsave */
1659         spin_lock_irqsave(&thi->t_lock, flags);
1660
1661         if (thi->t_state == None) {
1662                 spin_unlock_irqrestore(&thi->t_lock, flags);
1663                 if (restart)
1664                         drbd_thread_start(thi);
1665                 return;
1666         }
1667
1668         if (thi->t_state != ns) {
1669                 if (thi->task == NULL) {
1670                         spin_unlock_irqrestore(&thi->t_lock, flags);
1671                         return;
1672                 }
1673
1674                 thi->t_state = ns;
1675                 smp_mb();
1676                 init_completion(&thi->stop);
1677                 if (thi->task != current)
1678                         force_sig(DRBD_SIGKILL, thi->task);
1679
1680         }
1681
1682         spin_unlock_irqrestore(&thi->t_lock, flags);
1683
1684         if (wait)
1685                 wait_for_completion(&thi->stop);
1686 }
1687
1688 #ifdef CONFIG_SMP
1689 /**
1690  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1691  * @mdev:       DRBD device.
1692  *
1693  * Forces all threads of a device onto the same CPU. This is beneficial for
1694  * DRBD's performance. May be overwritten by user's configuration.
1695  */
1696 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1697 {
1698         int ord, cpu;
1699
1700         /* user override. */
1701         if (cpumask_weight(mdev->cpu_mask))
1702                 return;
1703
1704         ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1705         for_each_online_cpu(cpu) {
1706                 if (ord-- == 0) {
1707                         cpumask_set_cpu(cpu, mdev->cpu_mask);
1708                         return;
1709                 }
1710         }
1711         /* should not be reached */
1712         cpumask_setall(mdev->cpu_mask);
1713 }
1714
1715 /**
1716  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1717  * @mdev:       DRBD device.
1718  *
1719  * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1720  * prematurely.
1721  */
1722 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1723 {
1724         struct task_struct *p = current;
1725         struct drbd_thread *thi =
1726                 p == mdev->asender.task  ? &mdev->asender  :
1727                 p == mdev->receiver.task ? &mdev->receiver :
1728                 p == mdev->worker.task   ? &mdev->worker   :
1729                 NULL;
1730         ERR_IF(thi == NULL)
1731                 return;
1732         if (!thi->reset_cpu_mask)
1733                 return;
1734         thi->reset_cpu_mask = 0;
1735         set_cpus_allowed_ptr(p, mdev->cpu_mask);
1736 }
1737 #endif
1738
1739 /* the appropriate socket mutex must be held already */
1740 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1741                           enum drbd_packets cmd, struct p_header80 *h,
1742                           size_t size, unsigned msg_flags)
1743 {
1744         int sent, ok;
1745
1746         ERR_IF(!h) return FALSE;
1747         ERR_IF(!size) return FALSE;
1748
1749         h->magic   = BE_DRBD_MAGIC;
1750         h->command = cpu_to_be16(cmd);
1751         h->length  = cpu_to_be16(size-sizeof(struct p_header80));
1752
1753         sent = drbd_send(mdev, sock, h, size, msg_flags);
1754
1755         ok = (sent == size);
1756         if (!ok)
1757                 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1758                     cmdname(cmd), (int)size, sent);
1759         return ok;
1760 }
1761
1762 /* don't pass the socket. we may only look at it
1763  * when we hold the appropriate socket mutex.
1764  */
1765 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1766                   enum drbd_packets cmd, struct p_header80 *h, size_t size)
1767 {
1768         int ok = 0;
1769         struct socket *sock;
1770
1771         if (use_data_socket) {
1772                 mutex_lock(&mdev->data.mutex);
1773                 sock = mdev->data.socket;
1774         } else {
1775                 mutex_lock(&mdev->meta.mutex);
1776                 sock = mdev->meta.socket;
1777         }
1778
1779         /* drbd_disconnect() could have called drbd_free_sock()
1780          * while we were waiting in down()... */
1781         if (likely(sock != NULL))
1782                 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1783
1784         if (use_data_socket)
1785                 mutex_unlock(&mdev->data.mutex);
1786         else
1787                 mutex_unlock(&mdev->meta.mutex);
1788         return ok;
1789 }
1790
1791 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1792                    size_t size)
1793 {
1794         struct p_header80 h;
1795         int ok;
1796
1797         h.magic   = BE_DRBD_MAGIC;
1798         h.command = cpu_to_be16(cmd);
1799         h.length  = cpu_to_be16(size);
1800
1801         if (!drbd_get_data_sock(mdev))
1802                 return 0;
1803
1804         ok = (sizeof(h) ==
1805                 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1806         ok = ok && (size ==
1807                 drbd_send(mdev, mdev->data.socket, data, size, 0));
1808
1809         drbd_put_data_sock(mdev);
1810
1811         return ok;
1812 }
1813
1814 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1815 {
1816         struct p_rs_param_95 *p;
1817         struct socket *sock;
1818         int size, rv;
1819         const int apv = mdev->agreed_pro_version;
1820
1821         size = apv <= 87 ? sizeof(struct p_rs_param)
1822                 : apv == 88 ? sizeof(struct p_rs_param)
1823                         + strlen(mdev->sync_conf.verify_alg) + 1
1824                 : apv <= 94 ? sizeof(struct p_rs_param_89)
1825                 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1826
1827         /* used from admin command context and receiver/worker context.
1828          * to avoid kmalloc, grab the socket right here,
1829          * then use the pre-allocated sbuf there */
1830         mutex_lock(&mdev->data.mutex);
1831         sock = mdev->data.socket;
1832
1833         if (likely(sock != NULL)) {
1834                 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1835
1836                 p = &mdev->data.sbuf.rs_param_95;
1837
1838                 /* initialize verify_alg and csums_alg */
1839                 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1840
1841                 p->rate = cpu_to_be32(sc->rate);
1842                 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1843                 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1844                 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1845                 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1846
1847                 if (apv >= 88)
1848                         strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1849                 if (apv >= 89)
1850                         strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1851
1852                 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1853         } else
1854                 rv = 0; /* not ok */
1855
1856         mutex_unlock(&mdev->data.mutex);
1857
1858         return rv;
1859 }
1860
1861 int drbd_send_protocol(struct drbd_conf *mdev)
1862 {
1863         struct p_protocol *p;
1864         int size, cf, rv;
1865
1866         size = sizeof(struct p_protocol);
1867
1868         if (mdev->agreed_pro_version >= 87)
1869                 size += strlen(mdev->net_conf->integrity_alg) + 1;
1870
1871         /* we must not recurse into our own queue,
1872          * as that is blocked during handshake */
1873         p = kmalloc(size, GFP_NOIO);
1874         if (p == NULL)
1875                 return 0;
1876
1877         p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1878         p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1879         p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1880         p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1881         p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1882
1883         cf = 0;
1884         if (mdev->net_conf->want_lose)
1885                 cf |= CF_WANT_LOSE;
1886         if (mdev->net_conf->dry_run) {
1887                 if (mdev->agreed_pro_version >= 92)
1888                         cf |= CF_DRY_RUN;
1889                 else {
1890                         dev_err(DEV, "--dry-run is not supported by peer");
1891                         kfree(p);
1892                         return 0;
1893                 }
1894         }
1895         p->conn_flags    = cpu_to_be32(cf);
1896
1897         if (mdev->agreed_pro_version >= 87)
1898                 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1899
1900         rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1901                            (struct p_header80 *)p, size);
1902         kfree(p);
1903         return rv;
1904 }
1905
1906 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1907 {
1908         struct p_uuids p;
1909         int i;
1910
1911         if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1912                 return 1;
1913
1914         for (i = UI_CURRENT; i < UI_SIZE; i++)
1915                 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1916
1917         mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1918         p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1919         uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1920         uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1921         uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1922         p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1923
1924         put_ldev(mdev);
1925
1926         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1927                              (struct p_header80 *)&p, sizeof(p));
1928 }
1929
1930 int drbd_send_uuids(struct drbd_conf *mdev)
1931 {
1932         return _drbd_send_uuids(mdev, 0);
1933 }
1934
1935 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1936 {
1937         return _drbd_send_uuids(mdev, 8);
1938 }
1939
1940
1941 int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1942 {
1943         struct p_rs_uuid p;
1944
1945         p.uuid = cpu_to_be64(val);
1946
1947         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1948                              (struct p_header80 *)&p, sizeof(p));
1949 }
1950
1951 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1952 {
1953         struct p_sizes p;
1954         sector_t d_size, u_size;
1955         int q_order_type;
1956         int ok;
1957
1958         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1959                 D_ASSERT(mdev->ldev->backing_bdev);
1960                 d_size = drbd_get_max_capacity(mdev->ldev);
1961                 u_size = mdev->ldev->dc.disk_size;
1962                 q_order_type = drbd_queue_order_type(mdev);
1963                 put_ldev(mdev);
1964         } else {
1965                 d_size = 0;
1966                 u_size = 0;
1967                 q_order_type = QUEUE_ORDERED_NONE;
1968         }
1969
1970         p.d_size = cpu_to_be64(d_size);
1971         p.u_size = cpu_to_be64(u_size);
1972         p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1973         p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
1974         p.queue_order_type = cpu_to_be16(q_order_type);
1975         p.dds_flags = cpu_to_be16(flags);
1976
1977         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1978                            (struct p_header80 *)&p, sizeof(p));
1979         return ok;
1980 }
1981
1982 /**
1983  * drbd_send_state() - Sends the drbd state to the peer
1984  * @mdev:       DRBD device.
1985  */
1986 int drbd_send_state(struct drbd_conf *mdev)
1987 {
1988         struct socket *sock;
1989         struct p_state p;
1990         int ok = 0;
1991
1992         /* Grab state lock so we wont send state if we're in the middle
1993          * of a cluster wide state change on another thread */
1994         drbd_state_lock(mdev);
1995
1996         mutex_lock(&mdev->data.mutex);
1997
1998         p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1999         sock = mdev->data.socket;
2000
2001         if (likely(sock != NULL)) {
2002                 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2003                                     (struct p_header80 *)&p, sizeof(p), 0);
2004         }
2005
2006         mutex_unlock(&mdev->data.mutex);
2007
2008         drbd_state_unlock(mdev);
2009         return ok;
2010 }
2011
2012 int drbd_send_state_req(struct drbd_conf *mdev,
2013         union drbd_state mask, union drbd_state val)
2014 {
2015         struct p_req_state p;
2016
2017         p.mask    = cpu_to_be32(mask.i);
2018         p.val     = cpu_to_be32(val.i);
2019
2020         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2021                              (struct p_header80 *)&p, sizeof(p));
2022 }
2023
2024 int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
2025 {
2026         struct p_req_state_reply p;
2027
2028         p.retcode    = cpu_to_be32(retcode);
2029
2030         return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2031                              (struct p_header80 *)&p, sizeof(p));
2032 }
2033
2034 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2035         struct p_compressed_bm *p,
2036         struct bm_xfer_ctx *c)
2037 {
2038         struct bitstream bs;
2039         unsigned long plain_bits;
2040         unsigned long tmp;
2041         unsigned long rl;
2042         unsigned len;
2043         unsigned toggle;
2044         int bits;
2045
2046         /* may we use this feature? */
2047         if ((mdev->sync_conf.use_rle == 0) ||
2048                 (mdev->agreed_pro_version < 90))
2049                         return 0;
2050
2051         if (c->bit_offset >= c->bm_bits)
2052                 return 0; /* nothing to do. */
2053
2054         /* use at most thus many bytes */
2055         bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2056         memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2057         /* plain bits covered in this code string */
2058         plain_bits = 0;
2059
2060         /* p->encoding & 0x80 stores whether the first run length is set.
2061          * bit offset is implicit.
2062          * start with toggle == 2 to be able to tell the first iteration */
2063         toggle = 2;
2064
2065         /* see how much plain bits we can stuff into one packet
2066          * using RLE and VLI. */
2067         do {
2068                 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2069                                     : _drbd_bm_find_next(mdev, c->bit_offset);
2070                 if (tmp == -1UL)
2071                         tmp = c->bm_bits;
2072                 rl = tmp - c->bit_offset;
2073
2074                 if (toggle == 2) { /* first iteration */
2075                         if (rl == 0) {
2076                                 /* the first checked bit was set,
2077                                  * store start value, */
2078                                 DCBP_set_start(p, 1);
2079                                 /* but skip encoding of zero run length */
2080                                 toggle = !toggle;
2081                                 continue;
2082                         }
2083                         DCBP_set_start(p, 0);
2084                 }
2085
2086                 /* paranoia: catch zero runlength.
2087                  * can only happen if bitmap is modified while we scan it. */
2088                 if (rl == 0) {
2089                         dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2090                             "t:%u bo:%lu\n", toggle, c->bit_offset);
2091                         return -1;
2092                 }
2093
2094                 bits = vli_encode_bits(&bs, rl);
2095                 if (bits == -ENOBUFS) /* buffer full */
2096                         break;
2097                 if (bits <= 0) {
2098                         dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2099                         return 0;
2100                 }
2101
2102                 toggle = !toggle;
2103                 plain_bits += rl;
2104                 c->bit_offset = tmp;
2105         } while (c->bit_offset < c->bm_bits);
2106
2107         len = bs.cur.b - p->code + !!bs.cur.bit;
2108
2109         if (plain_bits < (len << 3)) {
2110                 /* incompressible with this method.
2111                  * we need to rewind both word and bit position. */
2112                 c->bit_offset -= plain_bits;
2113                 bm_xfer_ctx_bit_to_word_offset(c);
2114                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2115                 return 0;
2116         }
2117
2118         /* RLE + VLI was able to compress it just fine.
2119          * update c->word_offset. */
2120         bm_xfer_ctx_bit_to_word_offset(c);
2121
2122         /* store pad_bits */
2123         DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2124
2125         return len;
2126 }
2127
2128 enum { OK, FAILED, DONE }
2129 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2130         struct p_header80 *h, struct bm_xfer_ctx *c)
2131 {
2132         struct p_compressed_bm *p = (void*)h;
2133         unsigned long num_words;
2134         int len;
2135         int ok;
2136
2137         len = fill_bitmap_rle_bits(mdev, p, c);
2138
2139         if (len < 0)
2140                 return FAILED;
2141
2142         if (len) {
2143                 DCBP_set_code(p, RLE_VLI_Bits);
2144                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2145                         sizeof(*p) + len, 0);
2146
2147                 c->packets[0]++;
2148                 c->bytes[0] += sizeof(*p) + len;
2149
2150                 if (c->bit_offset >= c->bm_bits)
2151                         len = 0; /* DONE */
2152         } else {
2153                 /* was not compressible.
2154                  * send a buffer full of plain text bits instead. */
2155                 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2156                 len = num_words * sizeof(long);
2157                 if (len)
2158                         drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2159                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2160                                    h, sizeof(struct p_header80) + len, 0);
2161                 c->word_offset += num_words;
2162                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2163
2164                 c->packets[1]++;
2165                 c->bytes[1] += sizeof(struct p_header80) + len;
2166
2167                 if (c->bit_offset > c->bm_bits)
2168                         c->bit_offset = c->bm_bits;
2169         }
2170         ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2171
2172         if (ok == DONE)
2173                 INFO_bm_xfer_stats(mdev, "send", c);
2174         return ok;
2175 }
2176
2177 /* See the comment at receive_bitmap() */
2178 int _drbd_send_bitmap(struct drbd_conf *mdev)
2179 {
2180         struct bm_xfer_ctx c;
2181         struct p_header80 *p;
2182         int ret;
2183
2184         ERR_IF(!mdev->bitmap) return FALSE;
2185
2186         /* maybe we should use some per thread scratch page,
2187          * and allocate that during initial device creation? */
2188         p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2189         if (!p) {
2190                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2191                 return FALSE;
2192         }
2193
2194         if (get_ldev(mdev)) {
2195                 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2196                         dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2197                         drbd_bm_set_all(mdev);
2198                         if (drbd_bm_write(mdev)) {
2199                                 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2200                                  * but otherwise process as per normal - need to tell other
2201                                  * side that a full resync is required! */
2202                                 dev_err(DEV, "Failed to write bitmap to disk!\n");
2203                         } else {
2204                                 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2205                                 drbd_md_sync(mdev);
2206                         }
2207                 }
2208                 put_ldev(mdev);
2209         }
2210
2211         c = (struct bm_xfer_ctx) {
2212                 .bm_bits = drbd_bm_bits(mdev),
2213                 .bm_words = drbd_bm_words(mdev),
2214         };
2215
2216         do {
2217                 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2218         } while (ret == OK);
2219
2220         free_page((unsigned long) p);
2221         return (ret == DONE);
2222 }
2223
/*
 * Grab the data socket and send the whole bitmap.
 * Returns 0 on success, 1 if sending failed, -1 if the data socket could
 * not be obtained.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int err = -1;

	if (drbd_get_data_sock(mdev)) {
		err = !_drbd_send_bitmap(mdev);
		drbd_put_data_sock(mdev);
	}

	return err;
}
2234
2235 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2236 {
2237         int ok;
2238         struct p_barrier_ack p;
2239
2240         p.barrier  = barrier_nr;
2241         p.set_size = cpu_to_be32(set_size);
2242
2243         if (mdev->state.conn < C_CONNECTED)
2244                 return FALSE;
2245         ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2246                         (struct p_header80 *)&p, sizeof(p));
2247         return ok;
2248 }
2249
2250 /**
2251  * _drbd_send_ack() - Sends an ack packet
2252  * @mdev:       DRBD device.
2253  * @cmd:        Packet command code.
2254  * @sector:     sector, needs to be in big endian byte order
2255  * @blksize:    size in byte, needs to be in big endian byte order
2256  * @block_id:   Id, big endian byte order
2257  */
2258 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2259                           u64 sector,
2260                           u32 blksize,
2261                           u64 block_id)
2262 {
2263         int ok;
2264         struct p_block_ack p;
2265
2266         p.sector   = sector;
2267         p.block_id = block_id;
2268         p.blksize  = blksize;
2269         p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2270
2271         if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2272                 return FALSE;
2273         ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2274                                 (struct p_header80 *)&p, sizeof(p));
2275         return ok;
2276 }
2277
2278 /* dp->sector and dp->block_id already/still in network byte order,
2279  * data_size is payload size according to dp->head,
2280  * and may need to be corrected for digest size. */
2281 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2282                      struct p_data *dp, int data_size)
2283 {
2284         data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2285                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2286         return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2287                               dp->block_id);
2288 }
2289
2290 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2291                      struct p_block_req *rp)
2292 {
2293         return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2294 }
2295
2296 /**
2297  * drbd_send_ack() - Sends an ack packet
2298  * @mdev:       DRBD device.
2299  * @cmd:        Packet command code.
2300  * @e:          Epoch entry.
2301  */
2302 int drbd_send_ack(struct drbd_conf *mdev,
2303         enum drbd_packets cmd, struct drbd_epoch_entry *e)
2304 {
2305         return _drbd_send_ack(mdev, cmd,
2306                               cpu_to_be64(e->sector),
2307                               cpu_to_be32(e->size),
2308                               e->block_id);
2309 }
2310
2311 /* This function misuses the block_id field to signal if the blocks
2312  * are is sync or not. */
2313 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2314                      sector_t sector, int blksize, u64 block_id)
2315 {
2316         return _drbd_send_ack(mdev, cmd,
2317                               cpu_to_be64(sector),
2318                               cpu_to_be32(blksize),
2319                               cpu_to_be64(block_id));
2320 }
2321
2322 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2323                        sector_t sector, int size, u64 block_id)
2324 {
2325         int ok;
2326         struct p_block_req p;
2327
2328         p.sector   = cpu_to_be64(sector);
2329         p.block_id = block_id;
2330         p.blksize  = cpu_to_be32(size);
2331
2332         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2333                                 (struct p_header80 *)&p, sizeof(p));
2334         return ok;
2335 }
2336
2337 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2338                             sector_t sector, int size,
2339                             void *digest, int digest_size,
2340                             enum drbd_packets cmd)
2341 {
2342         int ok;
2343         struct p_block_req p;
2344
2345         p.sector   = cpu_to_be64(sector);
2346         p.block_id = BE_DRBD_MAGIC + 0xbeef;
2347         p.blksize  = cpu_to_be32(size);
2348
2349         p.head.magic   = BE_DRBD_MAGIC;
2350         p.head.command = cpu_to_be16(cmd);
2351         p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2352
2353         mutex_lock(&mdev->data.mutex);
2354
2355         ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2356         ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2357
2358         mutex_unlock(&mdev->data.mutex);
2359
2360         return ok;
2361 }
2362
2363 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2364 {
2365         int ok;
2366         struct p_block_req p;
2367
2368         p.sector   = cpu_to_be64(sector);
2369         p.block_id = BE_DRBD_MAGIC + 0xbabe;
2370         p.blksize  = cpu_to_be32(size);
2371
2372         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2373                            (struct p_header80 *)&p, sizeof(p));
2374         return ok;
2375 }
2376
2377 /* called on sndtimeo
2378  * returns FALSE if we should retry,
2379  * TRUE if we think connection is dead
2380  */
2381 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2382 {
2383         int drop_it;
2384         /* long elapsed = (long)(jiffies - mdev->last_received); */
2385
2386         drop_it =   mdev->meta.socket == sock
2387                 || !mdev->asender.task
2388                 || get_t_state(&mdev->asender) != Running
2389                 || mdev->state.conn < C_CONNECTED;
2390
2391         if (drop_it)
2392                 return TRUE;
2393
2394         drop_it = !--mdev->ko_count;
2395         if (!drop_it) {
2396                 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2397                        current->comm, current->pid, mdev->ko_count);
2398                 request_ping(mdev);
2399         }
2400
2401         return drop_it; /* && (mdev->state == R_PRIMARY) */;
2402 }
2403
2404 /* The idea of sendpage seems to be to put some kind of reference
2405  * to the page into the skb, and to hand it over to the NIC. In
2406  * this process get_page() gets called.
2407  *
2408  * As soon as the page was really sent over the network put_page()
2409  * gets called by some part of the network layer. [ NIC driver? ]
2410  *
2411  * [ get_page() / put_page() increment/decrement the count. If count
2412  *   reaches 0 the page will be freed. ]
2413  *
2414  * This works nicely with pages from FSs.
2415  * But this means that in protocol A we might signal IO completion too early!
2416  *
2417  * In order not to corrupt data during a resync we must make sure
2418  * that we do not reuse our own buffer pages (EEs) too early, therefore
2419  * we have the net_ee list.
2420  *
2421  * XFS seems to have problems, still, it submits pages with page_count == 0!
2422  * As a workaround, we disable sendpage on pages
2423  * with page_count == 0 or PageSlab.
2424  */
2425 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2426                    int offset, size_t size, unsigned msg_flags)
2427 {
2428         int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2429         kunmap(page);
2430         if (sent == size)
2431                 mdev->send_cnt += size>>9;
2432         return sent == size;
2433 }
2434
2435 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2436                     int offset, size_t size, unsigned msg_flags)
2437 {
2438         mm_segment_t oldfs = get_fs();
2439         int sent, ok;
2440         int len = size;
2441
2442         /* e.g. XFS meta- & log-data is in slab pages, which have a
2443          * page_count of 0 and/or have PageSlab() set.
2444          * we cannot use send_page for those, as that does get_page();
2445          * put_page(); and would cause either a VM_BUG directly, or
2446          * __page_cache_release a page that would actually still be referenced
2447          * by someone, leading to some obscure delayed Oops somewhere else. */
2448         if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2449                 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2450
2451         msg_flags |= MSG_NOSIGNAL;
2452         drbd_update_congested(mdev);
2453         set_fs(KERNEL_DS);
2454         do {
2455                 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2456                                                         offset, len,
2457                                                         msg_flags);
2458                 if (sent == -EAGAIN) {
2459                         if (we_should_drop_the_connection(mdev,
2460                                                           mdev->data.socket))
2461                                 break;
2462                         else
2463                                 continue;
2464                 }
2465                 if (sent <= 0) {
2466                         dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2467                              __func__, (int)size, len, sent);
2468                         break;
2469                 }
2470                 len    -= sent;
2471                 offset += sent;
2472         } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2473         set_fs(oldfs);
2474         clear_bit(NET_CONGESTED, &mdev->flags);
2475
2476         ok = (len == 0);
2477         if (likely(ok))
2478                 mdev->send_cnt += size>>9;
2479         return ok;
2480 }
2481
2482 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2483 {
2484         struct bio_vec *bvec;
2485         int i;
2486         /* hint all but last page with MSG_MORE */
2487         __bio_for_each_segment(bvec, bio, i, 0) {
2488                 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2489                                      bvec->bv_offset, bvec->bv_len,
2490                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2491                         return 0;
2492         }
2493         return 1;
2494 }
2495
2496 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2497 {
2498         struct bio_vec *bvec;
2499         int i;
2500         /* hint all but last page with MSG_MORE */
2501         __bio_for_each_segment(bvec, bio, i, 0) {
2502                 if (!_drbd_send_page(mdev, bvec->bv_page,
2503                                      bvec->bv_offset, bvec->bv_len,
2504                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2505                         return 0;
2506         }
2507         return 1;
2508 }
2509
2510 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2511 {
2512         struct page *page = e->pages;
2513         unsigned len = e->size;
2514         /* hint all but last page with MSG_MORE */
2515         page_chain_for_each(page) {
2516                 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2517                 if (!_drbd_send_page(mdev, page, 0, l,
2518                                 page_chain_next(page) ? MSG_MORE : 0))
2519                         return 0;
2520                 len -= l;
2521         }
2522         return 1;
2523 }
2524
2525 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2526 {
2527         if (mdev->agreed_pro_version >= 95)
2528                 return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2529                         (bi_rw & REQ_FUA ? DP_FUA : 0) |
2530                         (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2531                         (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2532         else
2533                 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2534 }
2535
2536 /* Used to send write requests
2537  * R_PRIMARY -> Peer    (P_DATA)
2538  */
2539 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2540 {
2541         int ok = 1;
2542         struct p_data p;
2543         unsigned int dp_flags = 0;
2544         void *dgb;
2545         int dgs;
2546
2547         if (!drbd_get_data_sock(mdev))
2548                 return 0;
2549
2550         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2551                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2552
2553         if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2554                 p.head.h80.magic   = BE_DRBD_MAGIC;
2555                 p.head.h80.command = cpu_to_be16(P_DATA);
2556                 p.head.h80.length  =
2557                         cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2558         } else {
2559                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2560                 p.head.h95.command = cpu_to_be16(P_DATA);
2561                 p.head.h95.length  =
2562                         cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2563         }
2564
2565         p.sector   = cpu_to_be64(req->sector);
2566         p.block_id = (unsigned long)req;
2567         p.seq_num  = cpu_to_be32(req->seq_num =
2568                                  atomic_add_return(1, &mdev->packet_seq));
2569
2570         dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2571
2572         if (mdev->state.conn >= C_SYNC_SOURCE &&
2573             mdev->state.conn <= C_PAUSED_SYNC_T)
2574                 dp_flags |= DP_MAY_SET_IN_SYNC;
2575
2576         p.dp_flags = cpu_to_be32(dp_flags);
2577         set_bit(UNPLUG_REMOTE, &mdev->flags);
2578         ok = (sizeof(p) ==
2579                 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2580         if (ok && dgs) {
2581                 dgb = mdev->int_dig_out;
2582                 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2583                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2584         }
2585         if (ok) {
2586                 /* For protocol A, we have to memcpy the payload into
2587                  * socket buffers, as we may complete right away
2588                  * as soon as we handed it over to tcp, at which point the data
2589                  * pages may become invalid.
2590                  *
2591                  * For data-integrity enabled, we copy it as well, so we can be
2592                  * sure that even if the bio pages may still be modified, it
2593                  * won't change the data on the wire, thus if the digest checks
2594                  * out ok after sending on this side, but does not fit on the
2595                  * receiving side, we sure have detected corruption elsewhere.
2596                  */
2597                 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2598                         ok = _drbd_send_bio(mdev, req->master_bio);
2599                 else
2600                         ok = _drbd_send_zc_bio(mdev, req->master_bio);
2601
2602                 /* double check digest, sometimes buffers have been modified in flight. */
2603                 if (dgs > 0 && dgs <= 64) {
2604                         /* 64 byte, 512 bit, is the larges digest size
2605                          * currently supported in kernel crypto. */
2606                         unsigned char digest[64];
2607                         drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2608                         if (memcmp(mdev->int_dig_out, digest, dgs)) {
2609                                 dev_warn(DEV,
2610                                         "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2611                                         (unsigned long long)req->sector, req->size);
2612                         }
2613                 } /* else if (dgs > 64) {
2614                      ... Be noisy about digest too large ...
2615                 } */
2616         }
2617
2618         drbd_put_data_sock(mdev);
2619
2620         return ok;
2621 }
2622
2623 /* answer packet, used to send data back for read requests:
2624  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2625  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2626  */
2627 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2628                     struct drbd_epoch_entry *e)
2629 {
2630         int ok;
2631         struct p_data p;
2632         void *dgb;
2633         int dgs;
2634
2635         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2636                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2637
2638         if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2639                 p.head.h80.magic   = BE_DRBD_MAGIC;
2640                 p.head.h80.command = cpu_to_be16(cmd);
2641                 p.head.h80.length  =
2642                         cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2643         } else {
2644                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2645                 p.head.h95.command = cpu_to_be16(cmd);
2646                 p.head.h95.length  =
2647                         cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2648         }
2649
2650         p.sector   = cpu_to_be64(e->sector);
2651         p.block_id = e->block_id;
2652         /* p.seq_num  = 0;    No sequence numbers here.. */
2653
2654         /* Only called by our kernel thread.
2655          * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2656          * in response to admin command or module unload.
2657          */
2658         if (!drbd_get_data_sock(mdev))
2659                 return 0;
2660
2661         ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2662         if (ok && dgs) {
2663                 dgb = mdev->int_dig_out;
2664                 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2665                 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2666         }
2667         if (ok)
2668                 ok = _drbd_send_zc_ee(mdev, e);
2669
2670         drbd_put_data_sock(mdev);
2671
2672         return ok;
2673 }
2674
2675 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2676 {
2677         struct p_block_desc p;
2678
2679         p.sector  = cpu_to_be64(req->sector);
2680         p.blksize = cpu_to_be32(req->size);
2681
2682         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2683 }
2684
2685 /*
2686   drbd_send distinguishes two cases:
2687
2688   Packets sent via the data socket "sock"
2689   and packets sent via the meta data socket "msock"
2690
2691                     sock                      msock
2692   -----------------+-------------------------+------------------------------
2693   timeout           conf.timeout / 2          conf.timeout / 2
2694   timeout action    send a ping via msock     Abort communication
2695                                               and close all sockets
2696 */
2697
2698 /*
2699  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2700  */
2701 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2702               void *buf, size_t size, unsigned msg_flags)
2703 {
2704         struct kvec iov;
2705         struct msghdr msg;
2706         int rv, sent = 0;
2707
2708         if (!sock)
2709                 return -1000;
2710
2711         /* THINK  if (signal_pending) return ... ? */
2712
2713         iov.iov_base = buf;
2714         iov.iov_len  = size;
2715
2716         msg.msg_name       = NULL;
2717         msg.msg_namelen    = 0;
2718         msg.msg_control    = NULL;
2719         msg.msg_controllen = 0;
2720         msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2721
2722         if (sock == mdev->data.socket) {
2723                 mdev->ko_count = mdev->net_conf->ko_count;
2724                 drbd_update_congested(mdev);
2725         }
2726         do {
2727                 /* STRANGE
2728                  * tcp_sendmsg does _not_ use its size parameter at all ?
2729                  *
2730                  * -EAGAIN on timeout, -EINTR on signal.
2731                  */
2732 /* THINK
2733  * do we need to block DRBD_SIG if sock == &meta.socket ??
2734  * otherwise wake_asender() might interrupt some send_*Ack !
2735  */
2736                 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2737                 if (rv == -EAGAIN) {
2738                         if (we_should_drop_the_connection(mdev, sock))
2739                                 break;
2740                         else
2741                                 continue;
2742                 }
2743                 D_ASSERT(rv != 0);
2744                 if (rv == -EINTR) {
2745                         flush_signals(current);
2746                         rv = 0;
2747                 }
2748                 if (rv < 0)
2749                         break;
2750                 sent += rv;
2751                 iov.iov_base += rv;
2752                 iov.iov_len  -= rv;
2753         } while (sent < size);
2754
2755         if (sock == mdev->data.socket)
2756                 clear_bit(NET_CONGESTED, &mdev->flags);
2757
2758         if (rv <= 0) {
2759                 if (rv != -EAGAIN) {
2760                         dev_err(DEV, "%s_sendmsg returned %d\n",
2761                             sock == mdev->meta.socket ? "msock" : "sock",
2762                             rv);
2763                         drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2764                 } else
2765                         drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2766         }
2767
2768         return sent;
2769 }
2770
2771 static int drbd_open(struct block_device *bdev, fmode_t mode)
2772 {
2773         struct drbd_conf *mdev = bdev->bd_disk->private_data;
2774         unsigned long flags;
2775         int rv = 0;
2776
2777         mutex_lock(&drbd_main_mutex);
2778         spin_lock_irqsave(&mdev->req_lock, flags);
2779         /* to have a stable mdev->state.role
2780          * and no race with updating open_cnt */
2781
2782         if (mdev->state.role != R_PRIMARY) {
2783                 if (mode & FMODE_WRITE)
2784                         rv = -EROFS;
2785                 else if (!allow_oos)
2786                         rv = -EMEDIUMTYPE;
2787         }
2788
2789         if (!rv)
2790                 mdev->open_cnt++;
2791         spin_unlock_irqrestore(&mdev->req_lock, flags);
2792         mutex_unlock(&drbd_main_mutex);
2793
2794         return rv;
2795 }
2796
2797 static int drbd_release(struct gendisk *gd, fmode_t mode)
2798 {
2799         struct drbd_conf *mdev = gd->private_data;
2800         mutex_lock(&drbd_main_mutex);
2801         mdev->open_cnt--;
2802         mutex_unlock(&drbd_main_mutex);
2803         return 0;
2804 }
2805
2806 static void drbd_set_defaults(struct drbd_conf *mdev)
2807 {
2808         /* This way we get a compile error when sync_conf grows,
2809            and we forgot to initialize it here */
2810         mdev->sync_conf = (struct syncer_conf) {
2811                 /* .rate = */           DRBD_RATE_DEF,
2812                 /* .after = */          DRBD_AFTER_DEF,
2813                 /* .al_extents = */     DRBD_AL_EXTENTS_DEF,
2814                 /* .verify_alg = */     {}, 0,
2815                 /* .cpu_mask = */       {}, 0,
2816                 /* .csums_alg = */      {}, 0,
2817                 /* .use_rle = */        0,
2818                 /* .on_no_data = */     DRBD_ON_NO_DATA_DEF,
2819                 /* .c_plan_ahead = */   DRBD_C_PLAN_AHEAD_DEF,
2820                 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2821                 /* .c_fill_target = */  DRBD_C_FILL_TARGET_DEF,
2822                 /* .c_max_rate = */     DRBD_C_MAX_RATE_DEF,
2823                 /* .c_min_rate = */     DRBD_C_MIN_RATE_DEF
2824         };
2825
2826         /* Have to use that way, because the layout differs between
2827            big endian and little endian */
2828         mdev->state = (union drbd_state) {
2829                 { .role = R_SECONDARY,
2830                   .peer = R_UNKNOWN,
2831                   .conn = C_STANDALONE,
2832                   .disk = D_DISKLESS,
2833                   .pdsk = D_UNKNOWN,
2834                   .susp = 0,
2835                   .susp_nod = 0,
2836                   .susp_fen = 0
2837                 } };
2838 }
2839
2840 void drbd_init_set_defaults(struct drbd_conf *mdev)
2841 {
2842         /* the memset(,0,) did most of this.
2843          * note: only assignments, no allocation in here */
2844
2845         drbd_set_defaults(mdev);
2846
2847         atomic_set(&mdev->ap_bio_cnt, 0);
2848         atomic_set(&mdev->ap_pending_cnt, 0);
2849         atomic_set(&mdev->rs_pending_cnt, 0);
2850         atomic_set(&mdev->unacked_cnt, 0);
2851         atomic_set(&mdev->local_cnt, 0);
2852         atomic_set(&mdev->net_cnt, 0);
2853         atomic_set(&mdev->packet_seq, 0);
2854         atomic_set(&mdev->pp_in_use, 0);
2855         atomic_set(&mdev->pp_in_use_by_net, 0);
2856         atomic_set(&mdev->rs_sect_in, 0);
2857         atomic_set(&mdev->rs_sect_ev, 0);
2858         atomic_set(&mdev->ap_in_flight, 0);
2859
2860         mutex_init(&mdev->md_io_mutex);
2861         mutex_init(&mdev->data.mutex);
2862         mutex_init(&mdev->meta.mutex);
2863         sema_init(&mdev->data.work.s, 0);
2864         sema_init(&mdev->meta.work.s, 0);
2865         mutex_init(&mdev->state_mutex);
2866
2867         spin_lock_init(&mdev->data.work.q_lock);
2868         spin_lock_init(&mdev->meta.work.q_lock);
2869
2870         spin_lock_init(&mdev->al_lock);
2871         spin_lock_init(&mdev->req_lock);
2872         spin_lock_init(&mdev->peer_seq_lock);
2873         spin_lock_init(&mdev->epoch_lock);
2874
2875         INIT_LIST_HEAD(&mdev->active_ee);
2876         INIT_LIST_HEAD(&mdev->sync_ee);
2877         INIT_LIST_HEAD(&mdev->done_ee);
2878         INIT_LIST_HEAD(&mdev->read_ee);
2879         INIT_LIST_HEAD(&mdev->net_ee);
2880         INIT_LIST_HEAD(&mdev->resync_reads);
2881         INIT_LIST_HEAD(&mdev->data.work.q);
2882         INIT_LIST_HEAD(&mdev->meta.work.q);
2883         INIT_LIST_HEAD(&mdev->resync_work.list);
2884         INIT_LIST_HEAD(&mdev->unplug_work.list);
2885         INIT_LIST_HEAD(&mdev->go_diskless.list);
2886         INIT_LIST_HEAD(&mdev->md_sync_work.list);
2887         INIT_LIST_HEAD(&mdev->start_resync_work.list);
2888         INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2889
2890         mdev->resync_work.cb  = w_resync_inactive;
2891         mdev->unplug_work.cb  = w_send_write_hint;
2892         mdev->go_diskless.cb  = w_go_diskless;
2893         mdev->md_sync_work.cb = w_md_sync;
2894         mdev->bm_io_work.w.cb = w_bitmap_io;
2895         init_timer(&mdev->resync_timer);
2896         init_timer(&mdev->md_sync_timer);
2897         mdev->resync_timer.function = resync_timer_fn;
2898         mdev->resync_timer.data = (unsigned long) mdev;
2899         mdev->md_sync_timer.function = md_sync_timer_fn;
2900         mdev->md_sync_timer.data = (unsigned long) mdev;
2901
2902         init_waitqueue_head(&mdev->misc_wait);
2903         init_waitqueue_head(&mdev->state_wait);
2904         init_waitqueue_head(&mdev->net_cnt_wait);
2905         init_waitqueue_head(&mdev->ee_wait);
2906         init_waitqueue_head(&mdev->al_wait);
2907         init_waitqueue_head(&mdev->seq_wait);
2908
2909         drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2910         drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2911         drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2912
2913         mdev->agreed_pro_version = PRO_VERSION_MAX;
2914         mdev->write_ordering = WO_bdev_flush;
2915         mdev->resync_wenr = LC_FREE;
2916 }
2917
2918 void drbd_mdev_cleanup(struct drbd_conf *mdev)
2919 {
2920         int i;
2921         if (mdev->receiver.t_state != None)
2922                 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2923                                 mdev->receiver.t_state);
2924
2925         /* no need to lock it, I'm the only thread alive */
2926         if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
2927                 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2928         mdev->al_writ_cnt  =
2929         mdev->bm_writ_cnt  =
2930         mdev->read_cnt     =
2931         mdev->recv_cnt     =
2932         mdev->send_cnt     =
2933         mdev->writ_cnt     =
2934         mdev->p_size       =
2935         mdev->rs_start     =
2936         mdev->rs_total     =
2937         mdev->rs_failed    = 0;
2938         mdev->rs_last_events = 0;
2939         mdev->rs_last_sect_ev = 0;
2940         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2941                 mdev->rs_mark_left[i] = 0;
2942                 mdev->rs_mark_time[i] = 0;
2943         }
2944         D_ASSERT(mdev->net_conf == NULL);
2945
2946         drbd_set_my_capacity(mdev, 0);
2947         if (mdev->bitmap) {
2948                 /* maybe never allocated. */
2949                 drbd_bm_resize(mdev, 0, 1);
2950                 drbd_bm_cleanup(mdev);
2951         }
2952
2953         drbd_free_resources(mdev);
2954         clear_bit(AL_SUSPENDED, &mdev->flags);
2955
2956         /*
2957          * currently we drbd_init_ee only on module load, so
2958          * we may do drbd_release_ee only on module unload!
2959          */
2960         D_ASSERT(list_empty(&mdev->active_ee));
2961         D_ASSERT(list_empty(&mdev->sync_ee));
2962         D_ASSERT(list_empty(&mdev->done_ee));
2963         D_ASSERT(list_empty(&mdev->read_ee));
2964         D_ASSERT(list_empty(&mdev->net_ee));
2965         D_ASSERT(list_empty(&mdev->resync_reads));
2966         D_ASSERT(list_empty(&mdev->data.work.q));
2967         D_ASSERT(list_empty(&mdev->meta.work.q));
2968         D_ASSERT(list_empty(&mdev->resync_work.list));
2969         D_ASSERT(list_empty(&mdev->unplug_work.list));
2970         D_ASSERT(list_empty(&mdev->go_diskless.list));
2971 }
2972
2973
2974 static void drbd_destroy_mempools(void)
2975 {
2976         struct page *page;
2977
2978         while (drbd_pp_pool) {
2979                 page = drbd_pp_pool;
2980                 drbd_pp_pool = (struct page *)page_private(page);
2981                 __free_page(page);
2982                 drbd_pp_vacant--;
2983         }
2984
2985         /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2986
2987         if (drbd_ee_mempool)
2988                 mempool_destroy(drbd_ee_mempool);
2989         if (drbd_request_mempool)
2990                 mempool_destroy(drbd_request_mempool);
2991         if (drbd_ee_cache)
2992                 kmem_cache_destroy(drbd_ee_cache);
2993         if (drbd_request_cache)
2994                 kmem_cache_destroy(drbd_request_cache);
2995         if (drbd_bm_ext_cache)
2996                 kmem_cache_destroy(drbd_bm_ext_cache);
2997         if (drbd_al_ext_cache)
2998                 kmem_cache_destroy(drbd_al_ext_cache);
2999
3000         drbd_ee_mempool      = NULL;
3001         drbd_request_mempool = NULL;
3002         drbd_ee_cache        = NULL;
3003         drbd_request_cache   = NULL;
3004         drbd_bm_ext_cache    = NULL;
3005         drbd_al_ext_cache    = NULL;
3006
3007         return;
3008 }
3009
3010 static int drbd_create_mempools(void)
3011 {
3012         struct page *page;
3013         const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3014         int i;
3015
3016         /* prepare our caches and mempools */
3017         drbd_request_mempool = NULL;
3018         drbd_ee_cache        = NULL;
3019         drbd_request_cache   = NULL;
3020         drbd_bm_ext_cache    = NULL;
3021         drbd_al_ext_cache    = NULL;
3022         drbd_pp_pool         = NULL;
3023
3024         /* caches */
3025         drbd_request_cache = kmem_cache_create(
3026                 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3027         if (drbd_request_cache == NULL)
3028                 goto Enomem;
3029
3030         drbd_ee_cache = kmem_cache_create(
3031                 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3032         if (drbd_ee_cache == NULL)
3033                 goto Enomem;
3034
3035         drbd_bm_ext_cache = kmem_cache_create(
3036                 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3037         if (drbd_bm_ext_cache == NULL)
3038                 goto Enomem;
3039
3040         drbd_al_ext_cache = kmem_cache_create(
3041                 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3042         if (drbd_al_ext_cache == NULL)
3043                 goto Enomem;
3044
3045         /* mempools */
3046         drbd_request_mempool = mempool_create(number,
3047                 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3048         if (drbd_request_mempool == NULL)
3049                 goto Enomem;
3050
3051         drbd_ee_mempool = mempool_create(number,
3052                 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3053         if (drbd_ee_mempool == NULL)
3054                 goto Enomem;
3055
3056         /* drbd's page pool */
3057         spin_lock_init(&drbd_pp_lock);
3058
3059         for (i = 0; i < number; i++) {
3060                 page = alloc_page(GFP_HIGHUSER);
3061                 if (!page)
3062                         goto Enomem;
3063                 set_page_private(page, (unsigned long)drbd_pp_pool);
3064                 drbd_pp_pool = page;
3065         }
3066         drbd_pp_vacant = number;
3067
3068         return 0;
3069
3070 Enomem:
3071         drbd_destroy_mempools(); /* in case we allocated some */
3072         return -ENOMEM;
3073 }
3074
3075 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3076         void *unused)
3077 {
3078         /* just so we have it.  you never know what interesting things we
3079          * might want to do here some day...
3080          */
3081
3082         return NOTIFY_DONE;
3083 }
3084
3085 static struct notifier_block drbd_notifier = {
3086         .notifier_call = drbd_notify_sys,
3087 };
3088
3089 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3090 {
3091         int rr;
3092
3093         rr = drbd_release_ee(mdev, &mdev->active_ee);
3094         if (rr)
3095                 dev_err(DEV, "%d EEs in active list found!\n", rr);
3096
3097         rr = drbd_release_ee(mdev, &mdev->sync_ee);
3098         if (rr)
3099                 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3100
3101         rr = drbd_release_ee(mdev, &mdev->read_ee);
3102         if (rr)
3103                 dev_err(DEV, "%d EEs in read list found!\n", rr);
3104
3105         rr = drbd_release_ee(mdev, &mdev->done_ee);
3106         if (rr)
3107                 dev_err(DEV, "%d EEs in done list found!\n", rr);
3108
3109         rr = drbd_release_ee(mdev, &mdev->net_ee);
3110         if (rr)
3111                 dev_err(DEV, "%d EEs in net list found!\n", rr);
3112 }
3113
3114 /* caution. no locking.
3115  * currently only used from module cleanup code. */
3116 static void drbd_delete_device(unsigned int minor)
3117 {
3118         struct drbd_conf *mdev = minor_to_mdev(minor);
3119
3120         if (!mdev)
3121                 return;
3122
3123         /* paranoia asserts */
3124         if (mdev->open_cnt != 0)
3125                 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3126                                 __FILE__ , __LINE__);
3127
3128         ERR_IF (!list_empty(&mdev->data.work.q)) {
3129                 struct list_head *lp;
3130                 list_for_each(lp, &mdev->data.work.q) {
3131                         dev_err(DEV, "lp = %p\n", lp);
3132                 }
3133         };
3134         /* end paranoia asserts */
3135
3136         del_gendisk(mdev->vdisk);
3137
3138         /* cleanup stuff that may have been allocated during
3139          * device (re-)configuration or state changes */
3140
3141         if (mdev->this_bdev)
3142                 bdput(mdev->this_bdev);
3143
3144         drbd_free_resources(mdev);
3145
3146         drbd_release_ee_lists(mdev);
3147
3148         /* should be free'd on disconnect? */
3149         kfree(mdev->ee_hash);
3150         /*
3151         mdev->ee_hash_s = 0;
3152         mdev->ee_hash = NULL;
3153         */
3154
3155         lc_destroy(mdev->act_log);
3156         lc_destroy(mdev->resync);
3157
3158         kfree(mdev->p_uuid);
3159         /* mdev->p_uuid = NULL; */
3160
3161         kfree(mdev->int_dig_out);
3162         kfree(mdev->int_dig_in);
3163         kfree(mdev->int_dig_vv);
3164
3165         /* cleanup the rest that has been
3166          * allocated from drbd_new_device
3167          * and actually free the mdev itself */
3168         drbd_free_mdev(mdev);
3169 }
3170
3171 static void drbd_cleanup(void)
3172 {
3173         unsigned int i;
3174
3175         unregister_reboot_notifier(&drbd_notifier);
3176
3177         /* first remove proc,
3178          * drbdsetup uses it's presence to detect
3179          * whether DRBD is loaded.
3180          * If we would get stuck in proc removal,
3181          * but have netlink already deregistered,
3182          * some drbdsetup commands may wait forever
3183          * for an answer.
3184          */
3185         if (drbd_proc)
3186                 remove_proc_entry("drbd", NULL);
3187
3188         drbd_nl_cleanup();
3189
3190         if (minor_table) {
3191                 i = minor_count;
3192                 while (i--)
3193                         drbd_delete_device(i);
3194                 drbd_destroy_mempools();
3195         }
3196
3197         kfree(minor_table);
3198
3199         unregister_blkdev(DRBD_MAJOR, "drbd");
3200
3201         printk(KERN_INFO "drbd: module cleanup done.\n");
3202 }
3203
3204 /**
3205  * drbd_congested() - Callback for pdflush
3206  * @congested_data:     User data
3207  * @bdi_bits:           Bits pdflush is currently interested in
3208  *
3209  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3210  */
3211 static int drbd_congested(void *congested_data, int bdi_bits)
3212 {
3213         struct drbd_conf *mdev = congested_data;
3214         struct request_queue *q;
3215         char reason = '-';
3216         int r = 0;
3217
3218         if (!__inc_ap_bio_cond(mdev)) {
3219                 /* DRBD has frozen IO */
3220                 r = bdi_bits;
3221                 reason = 'd';
3222                 goto out;
3223         }
3224
3225         if (get_ldev(mdev)) {
3226                 q = bdev_get_queue(mdev->ldev->backing_bdev);
3227                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3228                 put_ldev(mdev);
3229                 if (r)
3230                         reason = 'b';
3231         }
3232
3233         if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3234                 r |= (1 << BDI_async_congested);
3235                 reason = reason == 'b' ? 'a' : 'n';
3236         }
3237
3238 out:
3239         mdev->congestion_reason = reason;
3240         return r;
3241 }
3242
3243 struct drbd_conf *drbd_new_device(unsigned int minor)
3244 {
3245         struct drbd_conf *mdev;
3246         struct gendisk *disk;
3247         struct request_queue *q;
3248
3249         /* GFP_KERNEL, we are outside of all write-out paths */
3250         mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3251         if (!mdev)
3252                 return NULL;
3253         if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3254                 goto out_no_cpumask;
3255
3256         mdev->minor = minor;
3257
3258         drbd_init_set_defaults(mdev);
3259
3260         q = blk_alloc_queue(GFP_KERNEL);
3261         if (!q)
3262                 goto out_no_q;
3263         mdev->rq_queue = q;
3264         q->queuedata   = mdev;
3265
3266         disk = alloc_disk(1);
3267         if (!disk)
3268                 goto out_no_disk;
3269         mdev->vdisk = disk;
3270
3271         set_disk_ro(disk, TRUE);
3272
3273         disk->queue = q;
3274         disk->major = DRBD_MAJOR;
3275         disk->first_minor = minor;
3276         disk->fops = &drbd_ops;
3277         sprintf(disk->disk_name, "drbd%d", minor);
3278         disk->private_data = mdev;
3279
3280         mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3281         /* we have no partitions. we contain only ourselves. */
3282         mdev->this_bdev->bd_contains = mdev->this_bdev;
3283
3284         q->backing_dev_info.congested_fn = drbd_congested;
3285         q->backing_dev_info.congested_data = mdev;
3286
3287         blk_queue_make_request(q, drbd_make_request);
3288         blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
3289         blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3290         blk_queue_merge_bvec(q, drbd_merge_bvec);
3291         q->queue_lock = &mdev->req_lock;
3292
3293         mdev->md_io_page = alloc_page(GFP_KERNEL);
3294         if (!mdev->md_io_page)
3295                 goto out_no_io_page;
3296
3297         if (drbd_bm_init(mdev))
3298                 goto out_no_bitmap;
3299         /* no need to lock access, we are still initializing this minor device. */
3300         if (!tl_init(mdev))
3301                 goto out_no_tl;
3302
3303         mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3304         if (!mdev->app_reads_hash)
3305                 goto out_no_app_reads;
3306
3307         mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3308         if (!mdev->current_epoch)
3309                 goto out_no_epoch;
3310
3311         INIT_LIST_HEAD(&mdev->current_epoch->list);
3312         mdev->epochs = 1;
3313
3314         return mdev;
3315
3316 /* out_whatever_else:
3317         kfree(mdev->current_epoch); */
3318 out_no_epoch:
3319         kfree(mdev->app_reads_hash);
3320 out_no_app_reads:
3321         tl_cleanup(mdev);
3322 out_no_tl:
3323         drbd_bm_cleanup(mdev);
3324 out_no_bitmap:
3325         __free_page(mdev->md_io_page);
3326 out_no_io_page:
3327         put_disk(disk);
3328 out_no_disk:
3329         blk_cleanup_queue(q);
3330 out_no_q:
3331         free_cpumask_var(mdev->cpu_mask);
3332 out_no_cpumask:
3333         kfree(mdev);
3334         return NULL;
3335 }
3336
3337 /* counterpart of drbd_new_device.
3338  * last part of drbd_delete_device. */
3339 void drbd_free_mdev(struct drbd_conf *mdev)
3340 {
3341         kfree(mdev->current_epoch);
3342         kfree(mdev->app_reads_hash);
3343         tl_cleanup(mdev);
3344         if (mdev->bitmap) /* should no longer be there. */
3345                 drbd_bm_cleanup(mdev);
3346         __free_page(mdev->md_io_page);
3347         put_disk(mdev->vdisk);
3348         blk_cleanup_queue(mdev->rq_queue);
3349         free_cpumask_var(mdev->cpu_mask);
3350         drbd_free_tl_hash(mdev);
3351         kfree(mdev);
3352 }
3353
3354
3355 int __init drbd_init(void)
3356 {
3357         int err;
3358
3359         if (sizeof(struct p_handshake) != 80) {
3360                 printk(KERN_ERR
3361                        "drbd: never change the size or layout "
3362                        "of the HandShake packet.\n");
3363                 return -EINVAL;
3364         }
3365
3366         if (1 > minor_count || minor_count > 255) {
3367                 printk(KERN_ERR
3368                         "drbd: invalid minor_count (%d)\n", minor_count);
3369 #ifdef MODULE
3370                 return -EINVAL;
3371 #else
3372                 minor_count = 8;
3373 #endif
3374         }
3375
3376         err = drbd_nl_init();
3377         if (err)
3378                 return err;
3379
3380         err = register_blkdev(DRBD_MAJOR, "drbd");
3381         if (err) {
3382                 printk(KERN_ERR
3383                        "drbd: unable to register block device major %d\n",
3384                        DRBD_MAJOR);
3385                 return err;
3386         }
3387
3388         register_reboot_notifier(&drbd_notifier);
3389
3390         /*
3391          * allocate all necessary structs
3392          */
3393         err = -ENOMEM;
3394
3395         init_waitqueue_head(&drbd_pp_wait);
3396
3397         drbd_proc = NULL; /* play safe for drbd_cleanup */
3398         minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3399                                 GFP_KERNEL);
3400         if (!minor_table)
3401                 goto Enomem;
3402
3403         err = drbd_create_mempools();
3404         if (err)
3405                 goto Enomem;
3406
3407         drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3408         if (!drbd_proc) {
3409                 printk(KERN_ERR "drbd: unable to register proc file\n");
3410                 goto Enomem;
3411         }
3412
3413         rwlock_init(&global_state_lock);
3414
3415         printk(KERN_INFO "drbd: initialized. "
3416                "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3417                API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3418         printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3419         printk(KERN_INFO "drbd: registered as block device major %d\n",
3420                 DRBD_MAJOR);
3421         printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3422
3423         return 0; /* Success! */
3424
3425 Enomem:
3426         drbd_cleanup();
3427         if (err == -ENOMEM)
3428                 /* currently always the case */
3429                 printk(KERN_ERR "drbd: ran out of memory\n");
3430         else
3431                 printk(KERN_ERR "drbd: initialization failure\n");
3432         return err;
3433 }
3434
3435 void drbd_free_bc(struct drbd_backing_dev *ldev)
3436 {
3437         if (ldev == NULL)
3438                 return;
3439
3440         blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3441         blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3442
3443         kfree(ldev);
3444 }
3445
3446 void drbd_free_sock(struct drbd_conf *mdev)
3447 {
3448         if (mdev->data.socket) {
3449                 mutex_lock(&mdev->data.mutex);
3450                 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3451                 sock_release(mdev->data.socket);
3452                 mdev->data.socket = NULL;
3453                 mutex_unlock(&mdev->data.mutex);
3454         }
3455         if (mdev->meta.socket) {
3456                 mutex_lock(&mdev->meta.mutex);
3457                 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3458                 sock_release(mdev->meta.socket);
3459                 mdev->meta.socket = NULL;
3460                 mutex_unlock(&mdev->meta.mutex);
3461         }
3462 }
3463
3464
3465 void drbd_free_resources(struct drbd_conf *mdev)
3466 {
3467         crypto_free_hash(mdev->csums_tfm);
3468         mdev->csums_tfm = NULL;
3469         crypto_free_hash(mdev->verify_tfm);
3470         mdev->verify_tfm = NULL;
3471         crypto_free_hash(mdev->cram_hmac_tfm);
3472         mdev->cram_hmac_tfm = NULL;
3473         crypto_free_hash(mdev->integrity_w_tfm);
3474         mdev->integrity_w_tfm = NULL;
3475         crypto_free_hash(mdev->integrity_r_tfm);
3476         mdev->integrity_r_tfm = NULL;
3477
3478         drbd_free_sock(mdev);
3479
3480         __no_warn(local,
3481                   drbd_free_bc(mdev->ldev);
3482                   mdev->ldev = NULL;);
3483 }
3484
3485 /* meta data management */
3486
3487 struct meta_data_on_disk {
3488         u64 la_size;           /* last agreed size. */
3489         u64 uuid[UI_SIZE];   /* UUIDs. */
3490         u64 device_uuid;
3491         u64 reserved_u64_1;
3492         u32 flags;             /* MDF */
3493         u32 magic;
3494         u32 md_size_sect;
3495         u32 al_offset;         /* offset to this block */
3496         u32 al_nr_extents;     /* important for restoring the AL */
3497               /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3498         u32 bm_offset;         /* offset to the bitmap, from here */
3499         u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3500         u32 reserved_u32[4];
3501
3502 } __packed;
3503
3504 /**
3505  * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3506  * @mdev:       DRBD device.
3507  */
3508 void drbd_md_sync(struct drbd_conf *mdev)
3509 {
3510         struct meta_data_on_disk *buffer;
3511         sector_t sector;
3512         int i;
3513
3514         del_timer(&mdev->md_sync_timer);
3515         /* timer may be rearmed by drbd_md_mark_dirty() now. */
3516         if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3517                 return;
3518
3519         /* We use here D_FAILED and not D_ATTACHING because we try to write
3520          * metadata even if we detach due to a disk failure! */
3521         if (!get_ldev_if_state(mdev, D_FAILED))
3522                 return;
3523
3524         mutex_lock(&mdev->md_io_mutex);
3525         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3526         memset(buffer, 0, 512);
3527
3528         buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3529         for (i = UI_CURRENT; i < UI_SIZE; i++)
3530                 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3531         buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3532         buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3533
3534         buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3535         buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3536         buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3537         buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3538         buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3539
3540         buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3541
3542         D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3543         sector = mdev->ldev->md.md_offset;
3544
3545         if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3546                 /* this was a try anyways ... */
3547                 dev_err(DEV, "meta data update failed!\n");
3548                 drbd_chk_io_error(mdev, 1, TRUE);
3549         }
3550
3551         /* Update mdev->ldev->md.la_size_sect,
3552          * since we updated it on metadata. */
3553         mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3554
3555         mutex_unlock(&mdev->md_io_mutex);
3556         put_ldev(mdev);
3557 }
3558
3559 /**
3560  * drbd_md_read() - Reads in the meta data super block
3561  * @mdev:       DRBD device.
3562  * @bdev:       Device from which the meta data should be read in.
3563  *
3564  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3565  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3566  */
3567 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3568 {
3569         struct meta_data_on_disk *buffer;
3570         int i, rv = NO_ERROR;
3571
3572         if (!get_ldev_if_state(mdev, D_ATTACHING))
3573                 return ERR_IO_MD_DISK;
3574
3575         mutex_lock(&mdev->md_io_mutex);
3576         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3577
3578         if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3579                 /* NOTE: cant do normal error processing here as this is
3580                    called BEFORE disk is attached */
3581                 dev_err(DEV, "Error while reading metadata.\n");
3582                 rv = ERR_IO_MD_DISK;
3583                 goto err;
3584         }
3585
3586         if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3587                 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3588                 rv = ERR_MD_INVALID;
3589                 goto err;
3590         }
3591         if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3592                 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3593                     be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3594                 rv = ERR_MD_INVALID;
3595                 goto err;
3596         }
3597         if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3598                 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3599                     be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3600                 rv = ERR_MD_INVALID;
3601                 goto err;
3602         }
3603         if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3604                 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3605                     be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3606                 rv = ERR_MD_INVALID;
3607                 goto err;
3608         }
3609
3610         if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3611                 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3612                     be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3613                 rv = ERR_MD_INVALID;
3614                 goto err;
3615         }
3616
3617         bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3618         for (i = UI_CURRENT; i < UI_SIZE; i++)
3619                 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3620         bdev->md.flags = be32_to_cpu(buffer->flags);
3621         mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3622         bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3623
3624         if (mdev->sync_conf.al_extents < 7)
3625                 mdev->sync_conf.al_extents = 127;
3626
3627  err:
3628         mutex_unlock(&mdev->md_io_mutex);
3629         put_ldev(mdev);
3630
3631         return rv;
3632 }
3633
3634 static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3635 {
3636         static char *uuid_str[UI_EXTENDED_SIZE] = {
3637                 [UI_CURRENT] = "CURRENT",
3638                 [UI_BITMAP] = "BITMAP",
3639                 [UI_HISTORY_START] = "HISTORY_START",
3640                 [UI_HISTORY_END] = "HISTORY_END",
3641                 [UI_SIZE] = "SIZE",
3642                 [UI_FLAGS] = "FLAGS",
3643         };
3644
3645         if (index >= UI_EXTENDED_SIZE) {
3646                 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3647                 return;
3648         }
3649
3650         dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3651                  uuid_str[index],
3652                  (unsigned long long)mdev->ldev->md.uuid[index]);
3653 }
3654
3655
3656 /**
3657  * drbd_md_mark_dirty() - Mark meta data super block as dirty
3658  * @mdev:       DRBD device.
3659  *
3660  * Call this function if you change anything that should be written to
3661  * the meta-data super block. This function sets MD_DIRTY, and starts a
3662  * timer that ensures that within five seconds you have to call drbd_md_sync().
3663  */
3664 #ifdef DEBUG
3665 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3666 {
3667         if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3668                 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3669                 mdev->last_md_mark_dirty.line = line;
3670                 mdev->last_md_mark_dirty.func = func;
3671         }
3672 }
3673 #else
3674 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3675 {
3676         if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3677                 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3678 }
3679 #endif
3680
3681 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3682 {
3683         int i;
3684
3685         for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
3686                 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3687                 debug_drbd_uuid(mdev, i+1);
3688         }
3689 }
3690
3691 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3692 {
3693         if (idx == UI_CURRENT) {
3694                 if (mdev->state.role == R_PRIMARY)
3695                         val |= 1;
3696                 else
3697                         val &= ~((u64)1);
3698
3699                 drbd_set_ed_uuid(mdev, val);
3700         }
3701
3702         mdev->ldev->md.uuid[idx] = val;
3703         debug_drbd_uuid(mdev, idx);
3704         drbd_md_mark_dirty(mdev);
3705 }
3706
3707
3708 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3709 {
3710         if (mdev->ldev->md.uuid[idx]) {
3711                 drbd_uuid_move_history(mdev);
3712                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3713                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3714         }
3715         _drbd_uuid_set(mdev, idx, val);
3716 }
3717
3718 /**
3719  * drbd_uuid_new_current() - Creates a new current UUID
3720  * @mdev:       DRBD device.
3721  *
3722  * Creates a new current UUID, and rotates the old current UUID into
3723  * the bitmap slot. Causes an incremental resync upon next connect.
3724  */
3725 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3726 {
3727         u64 val;
3728
3729         dev_info(DEV, "Creating new current UUID\n");
3730         D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3731         mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3732         debug_drbd_uuid(mdev, UI_BITMAP);
3733
3734         get_random_bytes(&val, sizeof(u64));
3735         _drbd_uuid_set(mdev, UI_CURRENT, val);
3736         /* get it to stable storage _now_ */
3737         drbd_md_sync(mdev);
3738 }
3739
3740 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3741 {
3742         if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3743                 return;
3744
3745         if (val == 0) {
3746                 drbd_uuid_move_history(mdev);
3747                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3748                 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3749                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3750                 debug_drbd_uuid(mdev, UI_BITMAP);
3751         } else {
3752                 if (mdev->ldev->md.uuid[UI_BITMAP])
3753                         dev_warn(DEV, "bm UUID already set");
3754
3755                 mdev->ldev->md.uuid[UI_BITMAP] = val;
3756                 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3757
3758                 debug_drbd_uuid(mdev, UI_BITMAP);
3759         }
3760         drbd_md_mark_dirty(mdev);
3761 }
3762
3763 /**
3764  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3765  * @mdev:       DRBD device.
3766  *
3767  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3768  */
3769 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3770 {
3771         int rv = -EIO;
3772
3773         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3774                 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3775                 drbd_md_sync(mdev);
3776                 drbd_bm_set_all(mdev);
3777
3778                 rv = drbd_bm_write(mdev);
3779
3780                 if (!rv) {
3781                         drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3782                         drbd_md_sync(mdev);
3783                 }
3784
3785                 put_ldev(mdev);
3786         }
3787
3788         return rv;
3789 }
3790
3791 /**
3792  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3793  * @mdev:       DRBD device.
3794  *
3795  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3796  */
3797 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3798 {
3799         int rv = -EIO;
3800
3801         drbd_resume_al(mdev);
3802         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3803                 drbd_bm_clear_all(mdev);
3804                 rv = drbd_bm_write(mdev);
3805                 put_ldev(mdev);
3806         }
3807
3808         return rv;
3809 }
3810
3811 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3812 {
3813         struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3814         int rv;
3815
3816         D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3817
3818         drbd_bm_lock(mdev, work->why);
3819         rv = work->io_fn(mdev);
3820         drbd_bm_unlock(mdev);
3821
3822         clear_bit(BITMAP_IO, &mdev->flags);
3823         smp_mb__after_clear_bit();
3824         wake_up(&mdev->misc_wait);
3825
3826         if (work->done)
3827                 work->done(mdev, rv);
3828
3829         clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3830         work->why = NULL;
3831
3832         return 1;
3833 }
3834
3835 void drbd_ldev_destroy(struct drbd_conf *mdev)
3836 {
3837         lc_destroy(mdev->resync);
3838         mdev->resync = NULL;
3839         lc_destroy(mdev->act_log);
3840         mdev->act_log = NULL;
3841         __no_warn(local,
3842                 drbd_free_bc(mdev->ldev);
3843                 mdev->ldev = NULL;);
3844
3845         if (mdev->md_io_tmpp) {
3846                 __free_page(mdev->md_io_tmpp);
3847                 mdev->md_io_tmpp = NULL;
3848         }
3849         clear_bit(GO_DISKLESS, &mdev->flags);
3850 }
3851
3852 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3853 {
3854         D_ASSERT(mdev->state.disk == D_FAILED);
3855         /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3856          * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3857          * the protected members anymore, though, so once put_ldev reaches zero
3858          * again, it will be safe to free them. */
3859         drbd_force_state(mdev, NS(disk, D_DISKLESS));
3860         return 1;
3861 }
3862
3863 void drbd_go_diskless(struct drbd_conf *mdev)
3864 {
3865         D_ASSERT(mdev->state.disk == D_FAILED);
3866         if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3867                 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3868 }
3869
3870 /**
3871  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3872  * @mdev:       DRBD device.
3873  * @io_fn:      IO callback to be called when bitmap IO is possible
3874  * @done:       callback to be called after the bitmap IO was performed
3875  * @why:        Descriptive text of the reason for doing the IO
3876  *
3877  * While IO on the bitmap happens we freeze application IO thus we ensure
3878  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3879  * called from worker context. It MUST NOT be used while a previous such
3880  * work is still pending!
3881  */
3882 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3883                           int (*io_fn)(struct drbd_conf *),
3884                           void (*done)(struct drbd_conf *, int),
3885                           char *why)
3886 {
3887         D_ASSERT(current == mdev->worker.task);
3888
3889         D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3890         D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3891         D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3892         if (mdev->bm_io_work.why)
3893                 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3894                         why, mdev->bm_io_work.why);
3895
3896         mdev->bm_io_work.io_fn = io_fn;
3897         mdev->bm_io_work.done = done;
3898         mdev->bm_io_work.why = why;
3899
3900         spin_lock_irq(&mdev->req_lock);
3901         set_bit(BITMAP_IO, &mdev->flags);
3902         if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3903                 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
3904                         drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3905         }
3906         spin_unlock_irq(&mdev->req_lock);
3907 }
3908
3909 /**
3910  * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
3911  * @mdev:       DRBD device.
3912  * @io_fn:      IO callback to be called when bitmap IO is possible
3913  * @why:        Descriptive text of the reason for doing the IO
3914  *
3915  * freezes application IO while that the actual IO operations runs. This
3916  * functions MAY NOT be called from worker context.
3917  */
3918 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3919 {
3920         int rv;
3921
3922         D_ASSERT(current != mdev->worker.task);
3923
3924         drbd_suspend_io(mdev);
3925
3926         drbd_bm_lock(mdev, why);
3927         rv = io_fn(mdev);
3928         drbd_bm_unlock(mdev);
3929
3930         drbd_resume_io(mdev);
3931
3932         return rv;
3933 }
3934
3935 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3936 {
3937         if ((mdev->ldev->md.flags & flag) != flag) {
3938                 drbd_md_mark_dirty(mdev);
3939                 mdev->ldev->md.flags |= flag;
3940         }
3941 }
3942
3943 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3944 {
3945         if ((mdev->ldev->md.flags & flag) != 0) {
3946                 drbd_md_mark_dirty(mdev);
3947                 mdev->ldev->md.flags &= ~flag;
3948         }
3949 }
3950 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3951 {
3952         return (bdev->md.flags & flag) != 0;
3953 }
3954
3955 static void md_sync_timer_fn(unsigned long data)
3956 {
3957         struct drbd_conf *mdev = (struct drbd_conf *) data;
3958
3959         drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3960 }
3961
3962 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3963 {
3964         dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3965 #ifdef DEBUG
3966         dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3967                 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3968 #endif
3969         drbd_md_sync(mdev);
3970         return 1;
3971 }
3972
3973 #ifdef CONFIG_DRBD_FAULT_INJECTION
/*
 * Fault insertion support.  The random number generator is shamelessly
 * stolen from kernel/rcutorture.c.
 */

/* Generator state; see _drbd_fault_random() for how it is advanced. */
struct fault_random_state {
	unsigned long state;	/* current value of the generator */
	unsigned long count;	/* calls left before entropy is mixed in again */
};

#define FAULT_RANDOM_MULT	39916801	/* prime */
#define FAULT_RANDOM_ADD	479001701	/* prime */
#define FAULT_RANDOM_REFRESH	10000		/* value count is reset to */
3984
3985 /*
3986  * Crude but fast random-number generator.  Uses a linear congruential
3987  * generator, with occasional help from get_random_bytes().
3988  */
3989 static unsigned long
3990 _drbd_fault_random(struct fault_random_state *rsp)
3991 {
3992         long refresh;
3993
3994         if (!rsp->count--) {
3995                 get_random_bytes(&refresh, sizeof(refresh));
3996                 rsp->state += refresh;
3997                 rsp->count = FAULT_RANDOM_REFRESH;
3998         }
3999         rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4000         return swahw32(rsp->state);
4001 }
4002
4003 static char *
4004 _drbd_fault_str(unsigned int type) {
4005         static char *_faults[] = {
4006                 [DRBD_FAULT_MD_WR] = "Meta-data write",
4007                 [DRBD_FAULT_MD_RD] = "Meta-data read",
4008                 [DRBD_FAULT_RS_WR] = "Resync write",
4009                 [DRBD_FAULT_RS_RD] = "Resync read",
4010                 [DRBD_FAULT_DT_WR] = "Data write",
4011                 [DRBD_FAULT_DT_RD] = "Data read",
4012                 [DRBD_FAULT_DT_RA] = "Data read ahead",
4013                 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4014                 [DRBD_FAULT_AL_EE] = "EE allocation",
4015                 [DRBD_FAULT_RECEIVE] = "receive data corruption",
4016         };
4017
4018         return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4019 }
4020
4021 unsigned int
4022 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4023 {
4024         static struct fault_random_state rrs = {0, 0};
4025
4026         unsigned int ret = (
4027                 (fault_devs == 0 ||
4028                         ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4029                 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4030
4031         if (ret) {
4032                 fault_count++;
4033
4034                 if (__ratelimit(&drbd_ratelimit_state))
4035                         dev_warn(DEV, "***Simulating %s failure\n",
4036                                 _drbd_fault_str(type));
4037         }
4038
4039         return ret;
4040 }
4041 #endif
4042
/*
 * drbd_buildtag() - identify the build of this driver.
 *
 * The tag is computed on the first call and cached in a static buffer:
 * "srcversion: <srcversion>" when a module object is available, and
 * "built-in" otherwise.  The buffer is pre-loaded with "\0uilt-in", so a
 * leading NUL means "not computed yet" and the built-in case only has to
 * patch in the first character.
 */
const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* bounded: never write past the static buffer */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
4061
4062 module_init(drbd_init)
4063 module_exit(drbd_cleanup)
4064
4065 EXPORT_SYMBOL(drbd_conn_str);
4066 EXPORT_SYMBOL(drbd_role_str);
4067 EXPORT_SYMBOL(drbd_disk_str);
4068 EXPORT_SYMBOL(drbd_set_st_err_str);