dm mpath: reduce memory pressure when requeuing
[pandora-kernel.git] drivers/md/dm-mpath.c
1 /*
2  * Copyright (C) 2003 Sistina Software Limited.
3  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7
8 #include <linux/device-mapper.h>
9
10 #include "dm.h"
11 #include "dm-path-selector.h"
12 #include "dm-uevent.h"
13
14 #include <linux/ctype.h>
15 #include <linux/init.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/pagemap.h>
19 #include <linux/slab.h>
20 #include <linux/time.h>
21 #include <linux/workqueue.h>
22 #include <linux/delay.h>
23 #include <scsi/scsi_dh.h>
24 #include <linux/atomic.h>
25
26 #define DM_MSG_PREFIX "multipath"
27 #define DM_PG_INIT_DELAY_MSECS 2000
28 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
29
30 /* Path properties */
31 struct pgpath {
32         struct list_head list;
33
34         struct priority_group *pg;      /* Owning PG */
35         unsigned is_active;             /* Path status */
36         unsigned fail_count;            /* Cumulative failure count */
37
38         struct dm_path path;
39         struct delayed_work activate_path;
40 };
41
42 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
43
44 /*
45  * Paths are grouped into Priority Groups, which are numbered from 1 upwards.
46  * Each has a path selector which controls which path gets used.
47  */
48 struct priority_group {
49         struct list_head list;
50
51         struct multipath *m;            /* Owning multipath instance */
52         struct path_selector ps;
53
54         unsigned pg_num;                /* Reference number */
55         unsigned bypassed;              /* Temporarily bypass this PG? */
56
57         unsigned nr_pgpaths;            /* Number of paths in PG */
58         struct list_head pgpaths;
59 };
60
61 /* Multipath context */
62 struct multipath {
63         struct list_head list;
64         struct dm_target *ti;
65
66         const char *hw_handler_name;
67         char *hw_handler_params;
68
69         spinlock_t lock;
70
71         unsigned nr_priority_groups;
72         struct list_head priority_groups;
73
74         wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
75
76         unsigned pg_init_required;      /* pg_init needs calling? */
77         unsigned pg_init_in_progress;   /* Only one pg_init allowed at once */
78         unsigned pg_init_delay_retry;   /* Delay pg_init retry? */
79
80         unsigned nr_valid_paths;        /* Total number of usable paths */
81         struct pgpath *current_pgpath;
82         struct priority_group *current_pg;
83         struct priority_group *next_pg; /* Switch to this PG if set */
84         unsigned repeat_count;          /* I/Os left before calling PS again */
85
86         unsigned queue_io:1;            /* Must we queue all I/O? */
87         unsigned queue_if_no_path:1;    /* Queue I/O if last path fails? */
88         unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
89         unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
90         unsigned pg_init_disabled:1;    /* pg_init is not currently allowed */
91
92         unsigned pg_init_retries;       /* Number of times to retry pg_init */
93         unsigned pg_init_count;         /* Number of times pg_init called */
94         unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
95
96         struct work_struct trigger_event;
97
98         /*
99          * We must use a mempool of dm_mpath_io structs so that we
100          * can resubmit bios on error.
101          */
102         mempool_t *mpio_pool;
103
104         struct mutex work_mutex;
105 };
106
107 /*
108  * Context information attached to each bio we process.
109  */
110 struct dm_mpath_io {
111         struct pgpath *pgpath;
112         size_t nr_bytes;
113 };
114
115 typedef int (*action_fn) (struct pgpath *pgpath);
116
117 static struct kmem_cache *_mpio_cache;
118
119 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
120 static void trigger_event(struct work_struct *work);
121 static void activate_path(struct work_struct *work);
122 static int __pgpath_busy(struct pgpath *pgpath);
123
124
125 /*-----------------------------------------------
126  * Allocation routines
127  *-----------------------------------------------*/
128
129 static struct pgpath *alloc_pgpath(void)
130 {
131         struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
132
133         if (pgpath) {
134                 pgpath->is_active = 1;
135                 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
136         }
137
138         return pgpath;
139 }
140
141 static void free_pgpath(struct pgpath *pgpath)
142 {
143         kfree(pgpath);
144 }
145
146 static struct priority_group *alloc_priority_group(void)
147 {
148         struct priority_group *pg;
149
150         pg = kzalloc(sizeof(*pg), GFP_KERNEL);
151
152         if (pg)
153                 INIT_LIST_HEAD(&pg->pgpaths);
154
155         return pg;
156 }
157
158 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
159 {
160         struct pgpath *pgpath, *tmp;
161         struct multipath *m = ti->private;
162
163         list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
164                 list_del(&pgpath->list);
165                 if (m->hw_handler_name)
166                         scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
167                 dm_put_device(ti, pgpath->path.dev);
168                 free_pgpath(pgpath);
169         }
170 }
171
172 static void free_priority_group(struct priority_group *pg,
173                                 struct dm_target *ti)
174 {
175         struct path_selector *ps = &pg->ps;
176
177         if (ps->type) {
178                 ps->type->destroy(ps);
179                 dm_put_path_selector(ps->type);
180         }
181
182         free_pgpaths(&pg->pgpaths, ti);
183         kfree(pg);
184 }
185
186 static struct multipath *alloc_multipath(struct dm_target *ti)
187 {
188         struct multipath *m;
189         unsigned min_ios = dm_get_reserved_rq_based_ios();
190
191         m = kzalloc(sizeof(*m), GFP_KERNEL);
192         if (m) {
193                 INIT_LIST_HEAD(&m->priority_groups);
194                 spin_lock_init(&m->lock);
195                 m->queue_io = 1;
196                 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
197                 INIT_WORK(&m->trigger_event, trigger_event);
198                 init_waitqueue_head(&m->pg_init_wait);
199                 mutex_init(&m->work_mutex);
200                 m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
201                 if (!m->mpio_pool) {
202                         kfree(m);
203                         return NULL;
204                 }
205                 m->ti = ti;
206                 ti->private = m;
207         }
208
209         return m;
210 }
211
212 static void free_multipath(struct multipath *m)
213 {
214         struct priority_group *pg, *tmp;
215
216         list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
217                 list_del(&pg->list);
218                 free_priority_group(pg, m->ti);
219         }
220
221         kfree(m->hw_handler_name);
222         kfree(m->hw_handler_params);
223         mempool_destroy(m->mpio_pool);
224         kfree(m);
225 }
226
227 static int set_mapinfo(struct multipath *m, union map_info *info)
228 {
229         struct dm_mpath_io *mpio;
230
231         mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
232         if (!mpio)
233                 return -ENOMEM;
234
235         memset(mpio, 0, sizeof(*mpio));
236         info->ptr = mpio;
237
238         return 0;
239 }
240
241 static void clear_mapinfo(struct multipath *m, union map_info *info)
242 {
243         struct dm_mpath_io *mpio = info->ptr;
244
245         info->ptr = NULL;
246         mempool_free(mpio, m->mpio_pool);
247 }
248
249 /*-----------------------------------------------
250  * Path selection
251  *-----------------------------------------------*/
252
253 static int __pg_init_all_paths(struct multipath *m)
254 {
255         struct pgpath *pgpath;
256         unsigned long pg_init_delay = 0;
257
258         if (m->pg_init_in_progress || m->pg_init_disabled)
259                 return 0;
260
261         m->pg_init_count++;
262         m->pg_init_required = 0;
263
264         /* Checked here, after the reset above, so pg_init_required is always cleared */
265         if (!m->current_pg)
266                 return 0;
267
268         if (m->pg_init_delay_retry)
269                 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
270                                                  m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
271         list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
272                 /* Skip failed paths */
273                 if (!pgpath->is_active)
274                         continue;
275                 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
276                                        pg_init_delay))
277                         m->pg_init_in_progress++;
278         }
279         return m->pg_init_in_progress;
280 }
281
282 static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
283 {
284         m->current_pg = pgpath->pg;
285
286         /* Must we initialise the PG first, and queue I/O till it's ready? */
287         if (m->hw_handler_name) {
288                 m->pg_init_required = 1;
289                 m->queue_io = 1;
290         } else {
291                 m->pg_init_required = 0;
292                 m->queue_io = 0;
293         }
294
295         m->pg_init_count = 0;
296 }
297
298 static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
299                                size_t nr_bytes)
300 {
301         struct dm_path *path;
302
303         path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
304         if (!path)
305                 return -ENXIO;
306
307         m->current_pgpath = path_to_pgpath(path);
308
309         if (m->current_pg != pg)
310                 __switch_pg(m, m->current_pgpath);
311
312         return 0;
313 }
314
315 static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
316 {
317         struct priority_group *pg;
318         unsigned bypassed = 1;
319
320         if (!m->nr_valid_paths)
321                 goto failed;
322
323         /* Were we instructed to switch PG? */
324         if (m->next_pg) {
325                 pg = m->next_pg;
326                 m->next_pg = NULL;
327                 if (!__choose_path_in_pg(m, pg, nr_bytes))
328                         return;
329         }
330
331         /* Don't change PG until it has no remaining paths */
332         if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
333                 return;
334
335         /*
336          * Loop through priority groups until we find a valid path.
337          * First time we skip PGs marked 'bypassed'.
338          * Second time we only try the ones we skipped, but set
339          * pg_init_delay_retry so we do not hammer controllers.
340          */
341         do {
342                 list_for_each_entry(pg, &m->priority_groups, list) {
343                         if (pg->bypassed == bypassed)
344                                 continue;
345                         if (!__choose_path_in_pg(m, pg, nr_bytes)) {
346                                 if (!bypassed)
347                                         m->pg_init_delay_retry = 1;
348                                 return;
349                         }
350                 }
351         } while (bypassed--);
352
353 failed:
354         m->current_pgpath = NULL;
355         m->current_pg = NULL;
356 }
357
358 /*
359  * Check whether bios must be queued in the device-mapper core rather
360  * than here in the target.
361  *
362  * m->lock must be held on entry.
363  *
364  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
365  * same value then we are not between multipath_presuspend()
366  * and multipath_resume() calls and we have no need to check
367  * for the DMF_NOFLUSH_SUSPENDING flag.
368  */
369 static int __must_push_back(struct multipath *m)
370 {
371         return (m->queue_if_no_path ||
372                 (m->queue_if_no_path != m->saved_queue_if_no_path &&
373                  dm_noflush_suspending(m->ti)));
374 }
375
376 #define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
377
378 static int map_io(struct multipath *m, struct request *clone,
379                   union map_info *map_context)
380 {
381         int r = DM_MAPIO_REQUEUE;
382         size_t nr_bytes = blk_rq_bytes(clone);
383         unsigned long flags;
384         struct pgpath *pgpath;
385         struct block_device *bdev;
386         struct dm_mpath_io *mpio;
387
388         spin_lock_irqsave(&m->lock, flags);
389
390         /* Do we need to select a new pgpath? */
391         if (!m->current_pgpath ||
392             (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
393                 __choose_pgpath(m, nr_bytes);
394
395         pgpath = m->current_pgpath;
396
397         if (pgpath) {
398                 if (pg_ready(m)) {
399                         if (set_mapinfo(m, map_context) < 0)
400                                 /* ENOMEM, requeue */
401                                 goto out_unlock;
402
403                         bdev = pgpath->path.dev->bdev;
404                         clone->q = bdev_get_queue(bdev);
405                         clone->rq_disk = bdev->bd_disk;
406                         clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
407                         mpio = map_context->ptr;
408                         mpio->pgpath = pgpath;
409                         mpio->nr_bytes = nr_bytes;
410                         if (pgpath->pg->ps.type->start_io)
411                                 pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
412                                                               &pgpath->path,
413                                                               nr_bytes);
414                         r = DM_MAPIO_REMAPPED;
415                         goto out_unlock;
416                 }
417                 __pg_init_all_paths(m);
418         } else if (!__must_push_back(m))
419                 r = -EIO;       /* Failed */
420
421 out_unlock:
422         spin_unlock_irqrestore(&m->lock, flags);
423
424         return r;
425 }
426
427 /*
428  * If we run out of usable paths, should we queue I/O or error it?
429  */
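/*
 * queue_if_no_path is normally toggled from user space through the message
 * interface handled later in this file, e.g. (with "mpatha" as a purely
 * illustrative map name):
 *
 *   dmsetup message mpatha 0 queue_if_no_path
 *   dmsetup message mpatha 0 fail_if_no_path
 */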
430 static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
431                             unsigned save_old_value)
432 {
433         unsigned long flags;
434
435         spin_lock_irqsave(&m->lock, flags);
436
437         if (save_old_value)
438                 m->saved_queue_if_no_path = m->queue_if_no_path;
439         else
440                 m->saved_queue_if_no_path = queue_if_no_path;
441         m->queue_if_no_path = queue_if_no_path;
442         if (!m->queue_if_no_path)
443                 dm_table_run_md_queue_async(m->ti->table);
444
445         spin_unlock_irqrestore(&m->lock, flags);
446
447         return 0;
448 }
449
450 /*
451  * An event is triggered whenever a path is taken out of use.
452  * Includes path failure and PG bypass.
453  */
454 static void trigger_event(struct work_struct *work)
455 {
456         struct multipath *m =
457                 container_of(work, struct multipath, trigger_event);
458
459         dm_table_event(m->ti->table);
460 }
461
462 /*-----------------------------------------------------------------
463  * Constructor/argument parsing:
464  * <#multipath feature args> [<arg>]*
465  * <#hw_handler args> [hw_handler [<arg>]*]
466  * <#priority groups>
467  * <initial priority group>
468  *     [<selector> <#selector args> [<arg>]*
469  *      <#paths> <#per-path selector args>
470  *         [<path> [<arg>]* ]+ ]+
471  *---------------------------------------------------------------*/
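/*
 * Purely illustrative example of the argument list described above (the
 * device numbers and the round-robin selector are assumptions, not anything
 * mandated here): no feature args, no hardware handler, two priority groups
 * of two paths each, starting in group 1:
 *
 *   0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000
 *           round-robin 0 2 1 8:48 1000 8:64 1000
 *
 * (one logical line, wrapped for readability; a full dm table line simply
 * prefixes this with "<start> <length> multipath")
 */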
472 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
473                                struct dm_target *ti)
474 {
475         int r;
476         struct path_selector_type *pst;
477         unsigned ps_argc;
478
479         static struct dm_arg _args[] = {
480                 {0, 1024, "invalid number of path selector args"},
481         };
482
483         pst = dm_get_path_selector(dm_shift_arg(as));
484         if (!pst) {
485                 ti->error = "unknown path selector type";
486                 return -EINVAL;
487         }
488
489         r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
490         if (r) {
491                 dm_put_path_selector(pst);
492                 return -EINVAL;
493         }
494
495         r = pst->create(&pg->ps, ps_argc, as->argv);
496         if (r) {
497                 dm_put_path_selector(pst);
498                 ti->error = "path selector constructor failed";
499                 return r;
500         }
501
502         pg->ps.type = pst;
503         dm_consume_args(as, ps_argc);
504
505         return 0;
506 }
507
508 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
509                                struct dm_target *ti)
510 {
511         int r;
512         struct pgpath *p;
513         struct multipath *m = ti->private;
514         struct request_queue *q = NULL;
515         const char *attached_handler_name;
516
517         /* we need at least a path arg */
518         if (as->argc < 1) {
519                 ti->error = "no device given";
520                 return ERR_PTR(-EINVAL);
521         }
522
523         p = alloc_pgpath();
524         if (!p)
525                 return ERR_PTR(-ENOMEM);
526
527         r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
528                           &p->path.dev);
529         if (r) {
530                 ti->error = "error getting device";
531                 goto bad;
532         }
533
534         if (m->retain_attached_hw_handler || m->hw_handler_name)
535                 q = bdev_get_queue(p->path.dev->bdev);
536
537         if (m->retain_attached_hw_handler) {
538                 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
539                 if (attached_handler_name) {
540                         /*
541                          * Reset hw_handler_name to match the attached handler
542                          * and clear any hw_handler_params associated with the
543                          * ignored handler.
544                          *
545                          * NB. This modifies the table line to show the actual
546                          * handler instead of the original table passed in.
547                          */
548                         kfree(m->hw_handler_name);
549                         m->hw_handler_name = attached_handler_name;
550
551                         kfree(m->hw_handler_params);
552                         m->hw_handler_params = NULL;
553                 }
554         }
555
556         if (m->hw_handler_name) {
557                 /*
558                  * Increments scsi_dh reference, even when using an
559                  * already-attached handler.
560                  */
561                 r = scsi_dh_attach(q, m->hw_handler_name);
562                 if (r == -EBUSY) {
563                         /*
564                          * Already attached to different hw_handler:
565                          * try to reattach with correct one.
566                          */
567                         scsi_dh_detach(q);
568                         r = scsi_dh_attach(q, m->hw_handler_name);
569                 }
570
571                 if (r < 0) {
572                         ti->error = "error attaching hardware handler";
573                         dm_put_device(ti, p->path.dev);
574                         goto bad;
575                 }
576
577                 if (m->hw_handler_params) {
578                         r = scsi_dh_set_params(q, m->hw_handler_params);
579                         if (r < 0) {
580                                 ti->error = "unable to set hardware "
581                                                         "handler parameters";
582                                 scsi_dh_detach(q);
583                                 dm_put_device(ti, p->path.dev);
584                                 goto bad;
585                         }
586                 }
587         }
588
589         r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
590         if (r) {
591                 dm_put_device(ti, p->path.dev);
592                 goto bad;
593         }
594
595         return p;
596
597  bad:
598         free_pgpath(p);
599         return ERR_PTR(r);
600 }
601
602 static struct priority_group *parse_priority_group(struct dm_arg_set *as,
603                                                    struct multipath *m)
604 {
605         static struct dm_arg _args[] = {
606                 {1, 1024, "invalid number of paths"},
607                 {0, 1024, "invalid number of selector args"}
608         };
609
610         int r;
611         unsigned i, nr_selector_args, nr_args;
612         struct priority_group *pg;
613         struct dm_target *ti = m->ti;
614
615         if (as->argc < 2) {
616                 as->argc = 0;
617                 ti->error = "not enough priority group arguments";
618                 return ERR_PTR(-EINVAL);
619         }
620
621         pg = alloc_priority_group();
622         if (!pg) {
623                 ti->error = "couldn't allocate priority group";
624                 return ERR_PTR(-ENOMEM);
625         }
626         pg->m = m;
627
628         r = parse_path_selector(as, pg, ti);
629         if (r)
630                 goto bad;
631
632         /*
633          * read the paths
634          */
635         r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
636         if (r)
637                 goto bad;
638
639         r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
640         if (r)
641                 goto bad;
642
643         nr_args = 1 + nr_selector_args;
644         for (i = 0; i < pg->nr_pgpaths; i++) {
645                 struct pgpath *pgpath;
646                 struct dm_arg_set path_args;
647
648                 if (as->argc < nr_args) {
649                         ti->error = "not enough path parameters";
650                         r = -EINVAL;
651                         goto bad;
652                 }
653
654                 path_args.argc = nr_args;
655                 path_args.argv = as->argv;
656
657                 pgpath = parse_path(&path_args, &pg->ps, ti);
658                 if (IS_ERR(pgpath)) {
659                         r = PTR_ERR(pgpath);
660                         goto bad;
661                 }
662
663                 pgpath->pg = pg;
664                 list_add_tail(&pgpath->list, &pg->pgpaths);
665                 dm_consume_args(as, nr_args);
666         }
667
668         return pg;
669
670  bad:
671         free_priority_group(pg, ti);
672         return ERR_PTR(r);
673 }
674
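/*
 * Hardware handler argument sets accepted below look like, for example,
 * "0" (no handler) or "1 alua" (attach scsi_dh_alua with no parameters);
 * any handler-specific parameters follow the handler name and are passed
 * through scsi_dh_set_params().  The "alua" name is only an illustration
 * of an existing scsi_dh module.
 */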
675 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
676 {
677         unsigned hw_argc;
678         int ret;
679         struct dm_target *ti = m->ti;
680
681         static struct dm_arg _args[] = {
682                 {0, 1024, "invalid number of hardware handler args"},
683         };
684
685         if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
686                 return -EINVAL;
687
688         if (!hw_argc)
689                 return 0;
690
691         m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
692         if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name),
693                                      "scsi_dh_%s", m->hw_handler_name)) {
694                 ti->error = "unknown hardware handler type";
695                 ret = -EINVAL;
696                 goto fail;
697         }
698
699         if (hw_argc > 1) {
700                 char *p;
701                 int i, j, len = 4;
702
703                 for (i = 0; i <= hw_argc - 2; i++)
704                         len += strlen(as->argv[i]) + 1;
705                 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
706                 if (!p) {
707                         ti->error = "memory allocation failed";
708                         ret = -ENOMEM;
709                         goto fail;
710                 }
711                 j = sprintf(p, "%d", hw_argc - 1);
712                 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
713                         j = sprintf(p, "%s", as->argv[i]);
714         }
715         dm_consume_args(as, hw_argc - 1);
716
717         return 0;
718 fail:
719         kfree(m->hw_handler_name);
720         m->hw_handler_name = NULL;
721         return ret;
722 }
723
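/*
 * Feature argument sets accepted below look like, for example (the values
 * are chosen purely for illustration):
 *
 *   "0"
 *   "1 queue_if_no_path"
 *   "4 pg_init_retries 3 pg_init_delay_msecs 2000"
 *   "4 queue_if_no_path retain_attached_hw_handler pg_init_retries 3"
 */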
724 static int parse_features(struct dm_arg_set *as, struct multipath *m)
725 {
726         int r;
727         unsigned argc;
728         struct dm_target *ti = m->ti;
729         const char *arg_name;
730
731         static struct dm_arg _args[] = {
732                 {0, 6, "invalid number of feature args"},
733                 {1, 50, "pg_init_retries must be between 1 and 50"},
734                 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
735         };
736
737         r = dm_read_arg_group(_args, as, &argc, &ti->error);
738         if (r)
739                 return -EINVAL;
740
741         if (!argc)
742                 return 0;
743
744         do {
745                 arg_name = dm_shift_arg(as);
746                 argc--;
747
748                 if (!strcasecmp(arg_name, "queue_if_no_path")) {
749                         r = queue_if_no_path(m, 1, 0);
750                         continue;
751                 }
752
753                 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
754                         m->retain_attached_hw_handler = 1;
755                         continue;
756                 }
757
758                 if (!strcasecmp(arg_name, "pg_init_retries") &&
759                     (argc >= 1)) {
760                         r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
761                         argc--;
762                         continue;
763                 }
764
765                 if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
766                     (argc >= 1)) {
767                         r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
768                         argc--;
769                         continue;
770                 }
771
772                 ti->error = "Unrecognised multipath feature request";
773                 r = -EINVAL;
774         } while (argc && !r);
775
776         return r;
777 }
778
779 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
780                          char **argv)
781 {
782         /* target arguments */
783         static struct dm_arg _args[] = {
784                 {0, 1024, "invalid number of priority groups"},
785                 {0, 1024, "invalid initial priority group number"},
786         };
787
788         int r;
789         struct multipath *m;
790         struct dm_arg_set as;
791         unsigned pg_count = 0;
792         unsigned next_pg_num;
793
794         as.argc = argc;
795         as.argv = argv;
796
797         m = alloc_multipath(ti);
798         if (!m) {
799                 ti->error = "can't allocate multipath";
800                 return -EINVAL;
801         }
802
803         r = parse_features(&as, m);
804         if (r)
805                 goto bad;
806
807         r = parse_hw_handler(&as, m);
808         if (r)
809                 goto bad;
810
811         r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
812         if (r)
813                 goto bad;
814
815         r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
816         if (r)
817                 goto bad;
818
819         if ((!m->nr_priority_groups && next_pg_num) ||
820             (m->nr_priority_groups && !next_pg_num)) {
821                 ti->error = "invalid initial priority group";
822                 r = -EINVAL;
823                 goto bad;
824         }
825
826         /* parse the priority groups */
827         while (as.argc) {
828                 struct priority_group *pg;
829
830                 pg = parse_priority_group(&as, m);
831                 if (IS_ERR(pg)) {
832                         r = PTR_ERR(pg);
833                         goto bad;
834                 }
835
836                 m->nr_valid_paths += pg->nr_pgpaths;
837                 list_add_tail(&pg->list, &m->priority_groups);
838                 pg_count++;
839                 pg->pg_num = pg_count;
840                 if (!--next_pg_num)
841                         m->next_pg = pg;
842         }
843
844         if (pg_count != m->nr_priority_groups) {
845                 ti->error = "priority group count mismatch";
846                 r = -EINVAL;
847                 goto bad;
848         }
849
850         ti->num_flush_bios = 1;
851         ti->num_discard_bios = 1;
852         ti->num_write_same_bios = 1;
853
854         return 0;
855
856  bad:
857         free_multipath(m);
858         return r;
859 }
860
861 static void multipath_wait_for_pg_init_completion(struct multipath *m)
862 {
863         DECLARE_WAITQUEUE(wait, current);
864         unsigned long flags;
865
866         add_wait_queue(&m->pg_init_wait, &wait);
867
868         while (1) {
869                 set_current_state(TASK_UNINTERRUPTIBLE);
870
871                 spin_lock_irqsave(&m->lock, flags);
872                 if (!m->pg_init_in_progress) {
873                         spin_unlock_irqrestore(&m->lock, flags);
874                         break;
875                 }
876                 spin_unlock_irqrestore(&m->lock, flags);
877
878                 io_schedule();
879         }
880         set_current_state(TASK_RUNNING);
881
882         remove_wait_queue(&m->pg_init_wait, &wait);
883 }
884
885 static void flush_multipath_work(struct multipath *m)
886 {
887         unsigned long flags;
888
889         spin_lock_irqsave(&m->lock, flags);
890         m->pg_init_disabled = 1;
891         spin_unlock_irqrestore(&m->lock, flags);
892
893         flush_workqueue(kmpath_handlerd);
894         multipath_wait_for_pg_init_completion(m);
895         flush_workqueue(kmultipathd);
896         flush_work(&m->trigger_event);
897
898         spin_lock_irqsave(&m->lock, flags);
899         m->pg_init_disabled = 0;
900         spin_unlock_irqrestore(&m->lock, flags);
901 }
902
903 static void multipath_dtr(struct dm_target *ti)
904 {
905         struct multipath *m = ti->private;
906
907         flush_multipath_work(m);
908         free_multipath(m);
909 }
910
911 /*
912  * Map cloned requests
913  */
914 static int multipath_map(struct dm_target *ti, struct request *clone,
915                          union map_info *map_context)
916 {
917         struct multipath *m = (struct multipath *) ti->private;
918
919         return map_io(m, clone, map_context);
920 }
921
922 /*
923  * Take a path out of use.
924  */
925 static int fail_path(struct pgpath *pgpath)
926 {
927         unsigned long flags;
928         struct multipath *m = pgpath->pg->m;
929
930         spin_lock_irqsave(&m->lock, flags);
931
932         if (!pgpath->is_active)
933                 goto out;
934
935         DMWARN("Failing path %s.", pgpath->path.dev->name);
936
937         pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
938         pgpath->is_active = 0;
939         pgpath->fail_count++;
940
941         m->nr_valid_paths--;
942
943         if (pgpath == m->current_pgpath)
944                 m->current_pgpath = NULL;
945
946         dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
947                       pgpath->path.dev->name, m->nr_valid_paths);
948
949         schedule_work(&m->trigger_event);
950
951 out:
952         spin_unlock_irqrestore(&m->lock, flags);
953
954         return 0;
955 }
956
957 /*
958  * Reinstate a previously-failed path
959  */
960 static int reinstate_path(struct pgpath *pgpath)
961 {
962         int r = 0;
963         unsigned long flags;
964         struct multipath *m = pgpath->pg->m;
965
966         spin_lock_irqsave(&m->lock, flags);
967
968         if (pgpath->is_active)
969                 goto out;
970
971         if (!pgpath->pg->ps.type->reinstate_path) {
972                 DMWARN("Reinstate path not supported by path selector %s",
973                        pgpath->pg->ps.type->name);
974                 r = -EINVAL;
975                 goto out;
976         }
977
978         r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
979         if (r)
980                 goto out;
981
982         pgpath->is_active = 1;
983
984         if (!m->nr_valid_paths++) {
985                 m->current_pgpath = NULL;
986                 dm_table_run_md_queue_async(m->ti->table);
987         } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
988                 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
989                         m->pg_init_in_progress++;
990         }
991
992         dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
993                       pgpath->path.dev->name, m->nr_valid_paths);
994
995         schedule_work(&m->trigger_event);
996
997 out:
998         spin_unlock_irqrestore(&m->lock, flags);
999
1000         return r;
1001 }
1002
1003 /*
1004  * Fail or reinstate all paths that match the provided struct dm_dev.
1005  */
1006 static int action_dev(struct multipath *m, struct dm_dev *dev,
1007                       action_fn action)
1008 {
1009         int r = -EINVAL;
1010         struct pgpath *pgpath;
1011         struct priority_group *pg;
1012
1013         list_for_each_entry(pg, &m->priority_groups, list) {
1014                 list_for_each_entry(pgpath, &pg->pgpaths, list) {
1015                         if (pgpath->path.dev == dev)
1016                                 r = action(pgpath);
1017                 }
1018         }
1019
1020         return r;
1021 }
1022
1023 /*
1024  * Temporarily try to avoid having to use the specified PG
1025  */
1026 static void bypass_pg(struct multipath *m, struct priority_group *pg,
1027                       int bypassed)
1028 {
1029         unsigned long flags;
1030
1031         spin_lock_irqsave(&m->lock, flags);
1032
1033         pg->bypassed = bypassed;
1034         m->current_pgpath = NULL;
1035         m->current_pg = NULL;
1036
1037         spin_unlock_irqrestore(&m->lock, flags);
1038
1039         schedule_work(&m->trigger_event);
1040 }
1041
1042 /*
1043  * Switch to using the specified PG from the next I/O that gets mapped
1044  */
1045 static int switch_pg_num(struct multipath *m, const char *pgstr)
1046 {
1047         struct priority_group *pg;
1048         unsigned pgnum;
1049         unsigned long flags;
1050         char dummy;
1051
1052         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1053             (pgnum > m->nr_priority_groups)) {
1054                 DMWARN("invalid PG number supplied to switch_pg_num");
1055                 return -EINVAL;
1056         }
1057
1058         spin_lock_irqsave(&m->lock, flags);
1059         list_for_each_entry(pg, &m->priority_groups, list) {
1060                 pg->bypassed = 0;
1061                 if (--pgnum)
1062                         continue;
1063
1064                 m->current_pgpath = NULL;
1065                 m->current_pg = NULL;
1066                 m->next_pg = pg;
1067         }
1068         spin_unlock_irqrestore(&m->lock, flags);
1069
1070         schedule_work(&m->trigger_event);
1071         return 0;
1072 }
1073
1074 /*
1075  * Set/clear bypassed status of a PG.
1076  * PGs are numbered upwards from 1 in the order they were declared.
1077  */
1078 static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
1079 {
1080         struct priority_group *pg;
1081         unsigned pgnum;
1082         char dummy;
1083
1084         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1085             (pgnum > m->nr_priority_groups)) {
1086                 DMWARN("invalid PG number supplied to bypass_pg");
1087                 return -EINVAL;
1088         }
1089
1090         list_for_each_entry(pg, &m->priority_groups, list) {
1091                 if (!--pgnum)
1092                         break;
1093         }
1094
1095         bypass_pg(m, pg, bypassed);
1096         return 0;
1097 }
1098
1099 /*
1100  * Should we retry pg_init immediately?
1101  */
1102 static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1103 {
1104         unsigned long flags;
1105         int limit_reached = 0;
1106
1107         spin_lock_irqsave(&m->lock, flags);
1108
1109         if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
1110                 m->pg_init_required = 1;
1111         else
1112                 limit_reached = 1;
1113
1114         spin_unlock_irqrestore(&m->lock, flags);
1115
1116         return limit_reached;
1117 }
1118
1119 static void pg_init_done(void *data, int errors)
1120 {
1121         struct pgpath *pgpath = data;
1122         struct priority_group *pg = pgpath->pg;
1123         struct multipath *m = pg->m;
1124         unsigned long flags;
1125         unsigned delay_retry = 0;
1126
1127         /* device or driver problems */
1128         switch (errors) {
1129         case SCSI_DH_OK:
1130                 break;
1131         case SCSI_DH_NOSYS:
1132                 if (!m->hw_handler_name) {
1133                         errors = 0;
1134                         break;
1135                 }
1136                 DMERR("Could not failover the device: Handler scsi_dh_%s "
1137                       "Error %d.", m->hw_handler_name, errors);
1138                 /*
1139                  * Fail the path for now, so we do not ping-pong
1140                  */
1141                 fail_path(pgpath);
1142                 break;
1143         case SCSI_DH_DEV_TEMP_BUSY:
1144                 /*
1145                  * Probably doing something like FW upgrade on the
1146                  * controller so try the other pg.
1147                  */
1148                 bypass_pg(m, pg, 1);
1149                 break;
1150         case SCSI_DH_RETRY:
1151                 /* Wait before retrying. */
1152                 delay_retry = 1;
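                /* fall through to the retry handling below */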
1153         case SCSI_DH_IMM_RETRY:
1154         case SCSI_DH_RES_TEMP_UNAVAIL:
1155                 if (pg_init_limit_reached(m, pgpath))
1156                         fail_path(pgpath);
1157                 errors = 0;
1158                 break;
1159         default:
1160                 /*
1161                  * We probably do not want to fail the path for a device
1162                  * error, but this is what the old dm did. In future
1163                  * patches we can do more advanced handling.
1164                  */
1165                 fail_path(pgpath);
1166         }
1167
1168         spin_lock_irqsave(&m->lock, flags);
1169         if (errors) {
1170                 if (pgpath == m->current_pgpath) {
1171                         DMERR("Could not failover device. Error %d.", errors);
1172                         m->current_pgpath = NULL;
1173                         m->current_pg = NULL;
1174                 }
1175         } else if (!m->pg_init_required)
1176                 pg->bypassed = 0;
1177
1178         if (--m->pg_init_in_progress)
1179                 /* Activations of other paths are still in progress */
1180                 goto out;
1181
1182         if (m->pg_init_required) {
1183                 m->pg_init_delay_retry = delay_retry;
1184                 if (__pg_init_all_paths(m))
1185                         goto out;
1186         }
1187         m->queue_io = 0;
1188
1189         /*
1190          * Wake up any thread waiting to suspend.
1191          */
1192         wake_up(&m->pg_init_wait);
1193
1194 out:
1195         spin_unlock_irqrestore(&m->lock, flags);
1196 }
1197
1198 static void activate_path(struct work_struct *work)
1199 {
1200         struct pgpath *pgpath =
1201                 container_of(work, struct pgpath, activate_path.work);
1202
1203         scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1204                                 pg_init_done, pgpath);
1205 }
1206
1207 static int noretry_error(int error)
1208 {
1209         switch (error) {
1210         case -EOPNOTSUPP:
1211         case -EREMOTEIO:
1212         case -EILSEQ:
1213         case -ENODATA:
1214         case -ENOSPC:
1215                 return 1;
1216         }
1217
1218         /* Anything else could be a path failure, so should be retried */
1219         return 0;
1220 }
1221
1222 /*
1223  * end_io handling
1224  */
1225 static int do_end_io(struct multipath *m, struct request *clone,
1226                      int error, struct dm_mpath_io *mpio)
1227 {
1228         /*
1229          * We don't queue any clone request inside the multipath target
1230          * during end I/O handling, since those clone requests don't have
1231          * bio clones.  If we queue them inside the multipath target,
1232          * we need to make bio clones, which requires memory allocation.
1233          * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
1234          *  don't have bio clones.)
1235          * Instead of queueing the clone request here, we queue the original
1236          * request into dm core, which will remake a clone request and
1237          * clone bios for it and resubmit it later.
1238          */
1239         int r = DM_ENDIO_REQUEUE;
1240         unsigned long flags;
1241
1242         if (!error && !clone->errors)
1243                 return 0;       /* I/O complete */
1244
1245         if (noretry_error(error)) {
1246                 if ((clone->cmd_flags & REQ_WRITE_SAME) &&
1247                     !clone->q->limits.max_write_same_sectors) {
1248                         struct queue_limits *limits;
1249
1250                         /* device doesn't really support WRITE SAME, disable it */
1251                         limits = dm_get_queue_limits(dm_table_get_md(m->ti->table));
1252                         limits->max_write_same_sectors = 0;
1253                 }
1254                 return error;
1255         }
1256
1257         if (mpio->pgpath)
1258                 fail_path(mpio->pgpath);
1259
1260         spin_lock_irqsave(&m->lock, flags);
1261         if (!m->nr_valid_paths) {
1262                 if (!m->queue_if_no_path) {
1263                         if (!__must_push_back(m))
1264                                 r = -EIO;
1265                 } else {
1266                         if (error == -EBADE)
1267                                 r = error;
1268                 }
1269         }
1270         spin_unlock_irqrestore(&m->lock, flags);
1271
1272         return r;
1273 }
1274
1275 static int multipath_end_io(struct dm_target *ti, struct request *clone,
1276                             int error, union map_info *map_context)
1277 {
1278         struct multipath *m = ti->private;
1279         struct dm_mpath_io *mpio = map_context->ptr;
1280         struct pgpath *pgpath;
1281         struct path_selector *ps;
1282         int r;
1283
1284         BUG_ON(!mpio);
1285
1286         r  = do_end_io(m, clone, error, mpio);
1287         pgpath = mpio->pgpath;
1288         if (pgpath) {
1289                 ps = &pgpath->pg->ps;
1290                 if (ps->type->end_io)
1291                         ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1292         }
1293         clear_mapinfo(m, map_context);
1294
1295         return r;
1296 }
1297
1298 /*
1299  * Suspend can't complete until all the I/O is processed, so if
1300  * the last path fails we must error any remaining I/O.
1301  * Note that if freeze_bdev fails while suspending, the
1302  * queue_if_no_path state is lost - userspace should reset it.
1303  */
1304 static void multipath_presuspend(struct dm_target *ti)
1305 {
1306         struct multipath *m = (struct multipath *) ti->private;
1307
1308         queue_if_no_path(m, 0, 1);
1309 }
1310
1311 static void multipath_postsuspend(struct dm_target *ti)
1312 {
1313         struct multipath *m = ti->private;
1314
1315         mutex_lock(&m->work_mutex);
1316         flush_multipath_work(m);
1317         mutex_unlock(&m->work_mutex);
1318 }
1319
1320 /*
1321  * Restore the queue_if_no_path setting.
1322  */
1323 static void multipath_resume(struct dm_target *ti)
1324 {
1325         struct multipath *m = (struct multipath *) ti->private;
1326         unsigned long flags;
1327
1328         spin_lock_irqsave(&m->lock, flags);
1329         m->queue_if_no_path = m->saved_queue_if_no_path;
1330         spin_unlock_irqrestore(&m->lock, flags);
1331 }
1332
1333 /*
1334  * Info output has the following format:
1335  * num_multipath_feature_args [multipath_feature_args]*
1336  * num_handler_status_args [handler_status_args]*
1337  * num_groups init_group_number
1338  *            [A|D|E num_ps_status_args [ps_status_args]*
1339  *             num_paths num_selector_args
1340  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1341  *
1342  * Table output has the following format (identical to the constructor string):
1343  * num_feature_args [features_args]*
1344  * num_handler_args hw_handler [hw_handler_args]*
1345  * num_groups init_group_number
1346  *     [priority selector-name num_ps_args [ps_args]*
1347  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1348  */
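/*
 * Illustrative sample output only (the device numbers and the round-robin
 * selector are assumptions): for a map with no features, no hardware
 * handler and two groups of two paths each, the table output would be
 *
 *   0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000
 *           round-robin 0 2 1 8:48 1000 8:64 1000
 *
 * and the corresponding info output something like
 *
 *   2 0 0 0 2 1 A 0 2 0 8:16 A 0 8:32 A 0 E 0 2 0 8:48 A 0 8:64 A 0
 */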
1349 static void multipath_status(struct dm_target *ti, status_type_t type,
1350                              unsigned status_flags, char *result, unsigned maxlen)
1351 {
1352         int sz = 0;
1353         unsigned long flags;
1354         struct multipath *m = (struct multipath *) ti->private;
1355         struct priority_group *pg;
1356         struct pgpath *p;
1357         unsigned pg_num;
1358         char state;
1359
1360         spin_lock_irqsave(&m->lock, flags);
1361
1362         /* Features */
1363         if (type == STATUSTYPE_INFO)
1364                 DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count);
1365         else {
1366                 DMEMIT("%u ", m->queue_if_no_path +
1367                               (m->pg_init_retries > 0) * 2 +
1368                               (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1369                               m->retain_attached_hw_handler);
1370                 if (m->queue_if_no_path)
1371                         DMEMIT("queue_if_no_path ");
1372                 if (m->pg_init_retries)
1373                         DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1374                 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1375                         DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1376                 if (m->retain_attached_hw_handler)
1377                         DMEMIT("retain_attached_hw_handler ");
1378         }
1379
1380         if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1381                 DMEMIT("0 ");
1382         else
1383                 DMEMIT("1 %s ", m->hw_handler_name);
1384
1385         DMEMIT("%u ", m->nr_priority_groups);
1386
1387         if (m->next_pg)
1388                 pg_num = m->next_pg->pg_num;
1389         else if (m->current_pg)
1390                 pg_num = m->current_pg->pg_num;
1391         else
1392                 pg_num = (m->nr_priority_groups ? 1 : 0);
1393
1394         DMEMIT("%u ", pg_num);
1395
1396         switch (type) {
1397         case STATUSTYPE_INFO:
1398                 list_for_each_entry(pg, &m->priority_groups, list) {
1399                         if (pg->bypassed)
1400                                 state = 'D';    /* Disabled */
1401                         else if (pg == m->current_pg)
1402                                 state = 'A';    /* Currently Active */
1403                         else
1404                                 state = 'E';    /* Enabled */
1405
1406                         DMEMIT("%c ", state);
1407
1408                         if (pg->ps.type->status)
1409                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1410                                                           result + sz,
1411                                                           maxlen - sz);
1412                         else
1413                                 DMEMIT("0 ");
1414
1415                         DMEMIT("%u %u ", pg->nr_pgpaths,
1416                                pg->ps.type->info_args);
1417
1418                         list_for_each_entry(p, &pg->pgpaths, list) {
1419                                 DMEMIT("%s %s %u ", p->path.dev->name,
1420                                        p->is_active ? "A" : "F",
1421                                        p->fail_count);
1422                                 if (pg->ps.type->status)
1423                                         sz += pg->ps.type->status(&pg->ps,
1424                                               &p->path, type, result + sz,
1425                                               maxlen - sz);
1426                         }
1427                 }
1428                 break;
1429
1430         case STATUSTYPE_TABLE:
1431                 list_for_each_entry(pg, &m->priority_groups, list) {
1432                         DMEMIT("%s ", pg->ps.type->name);
1433
1434                         if (pg->ps.type->status)
1435                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1436                                                           result + sz,
1437                                                           maxlen - sz);
1438                         else
1439                                 DMEMIT("0 ");
1440
1441                         DMEMIT("%u %u ", pg->nr_pgpaths,
1442                                pg->ps.type->table_args);
1443
1444                         list_for_each_entry(p, &pg->pgpaths, list) {
1445                                 DMEMIT("%s ", p->path.dev->name);
1446                                 if (pg->ps.type->status)
1447                                         sz += pg->ps.type->status(&pg->ps,
1448                                               &p->path, type, result + sz,
1449                                               maxlen - sz);
1450                         }
1451                 }
1452                 break;
1453         }
1454
1455         spin_unlock_irqrestore(&m->lock, flags);
1456 }
1457
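/*
 * The messages handled below are normally sent with dmsetup, e.g.
 * (the map name "mpatha" and path device "8:32" are placeholders):
 *
 *   dmsetup message mpatha 0 queue_if_no_path
 *   dmsetup message mpatha 0 fail_if_no_path
 *   dmsetup message mpatha 0 disable_group 2
 *   dmsetup message mpatha 0 enable_group 2
 *   dmsetup message mpatha 0 switch_group 2
 *   dmsetup message mpatha 0 fail_path 8:32
 *   dmsetup message mpatha 0 reinstate_path 8:32
 */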
1458 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1459 {
1460         int r = -EINVAL;
1461         struct dm_dev *dev;
1462         struct multipath *m = (struct multipath *) ti->private;
1463         action_fn action;
1464
1465         mutex_lock(&m->work_mutex);
1466
1467         if (dm_suspended(ti)) {
1468                 r = -EBUSY;
1469                 goto out;
1470         }
1471
1472         if (argc == 1) {
1473                 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1474                         r = queue_if_no_path(m, 1, 0);
1475                         goto out;
1476                 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1477                         r = queue_if_no_path(m, 0, 0);
1478                         goto out;
1479                 }
1480         }
1481
1482         if (argc != 2) {
1483                 DMWARN("Unrecognised multipath message received.");
1484                 goto out;
1485         }
1486
1487         if (!strcasecmp(argv[0], "disable_group")) {
1488                 r = bypass_pg_num(m, argv[1], 1);
1489                 goto out;
1490         } else if (!strcasecmp(argv[0], "enable_group")) {
1491                 r = bypass_pg_num(m, argv[1], 0);
1492                 goto out;
1493         } else if (!strcasecmp(argv[0], "switch_group")) {
1494                 r = switch_pg_num(m, argv[1]);
1495                 goto out;
1496         } else if (!strcasecmp(argv[0], "reinstate_path"))
1497                 action = reinstate_path;
1498         else if (!strcasecmp(argv[0], "fail_path"))
1499                 action = fail_path;
1500         else {
1501                 DMWARN("Unrecognised multipath message received.");
1502                 goto out;
1503         }
1504
1505         r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1506         if (r) {
1507                 DMWARN("message: error getting device %s",
1508                        argv[1]);
1509                 goto out;
1510         }
1511
1512         r = action_dev(m, dev, action);
1513
1514         dm_put_device(ti, dev);
1515
1516 out:
1517         mutex_unlock(&m->work_mutex);
1518         return r;
1519 }
1520
1521 static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1522                            unsigned long arg)
1523 {
1524         struct multipath *m = ti->private;
1525         struct pgpath *pgpath;
1526         struct block_device *bdev;
1527         fmode_t mode;
1528         unsigned long flags;
1529         int r;
1530
1531         bdev = NULL;
1532         mode = 0;
1533         r = 0;
1534
1535         spin_lock_irqsave(&m->lock, flags);
1536
1537         if (!m->current_pgpath)
1538                 __choose_pgpath(m, 0);
1539
1540         pgpath = m->current_pgpath;
1541
1542         if (pgpath) {
1543                 bdev = pgpath->path.dev->bdev;
1544                 mode = pgpath->path.dev->mode;
1545         }
1546
1547         if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
1548                 r = -ENOTCONN;
1549         else if (!bdev)
1550                 r = -EIO;
1551
1552         spin_unlock_irqrestore(&m->lock, flags);
1553
1554         /*
1555          * Only pass ioctls through if the device sizes match exactly.
1556          */
1557         if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
1558                 int err = scsi_verify_blk_ioctl(NULL, cmd);
1559                 if (err)
1560                         r = err;
1561         }
1562
1563         if (r == -ENOTCONN && !fatal_signal_pending(current)) {
1564                 spin_lock_irqsave(&m->lock, flags);
1565                 if (!m->current_pg) {
1566                         /* Path status changed, redo selection */
1567                         __choose_pgpath(m, 0);
1568                 }
1569                 if (m->pg_init_required)
1570                         __pg_init_all_paths(m);
1571                 spin_unlock_irqrestore(&m->lock, flags);
1572                 dm_table_run_md_queue_async(m->ti->table);
1573         }
1574
1575         return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
1576 }
1577
1578 static int multipath_iterate_devices(struct dm_target *ti,
1579                                      iterate_devices_callout_fn fn, void *data)
1580 {
1581         struct multipath *m = ti->private;
1582         struct priority_group *pg;
1583         struct pgpath *p;
1584         int ret = 0;
1585
1586         list_for_each_entry(pg, &m->priority_groups, list) {
1587                 list_for_each_entry(p, &pg->pgpaths, list) {
1588                         ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1589                         if (ret)
1590                                 goto out;
1591                 }
1592         }
1593
1594 out:
1595         return ret;
1596 }
1597
1598 static int __pgpath_busy(struct pgpath *pgpath)
1599 {
1600         struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1601
1602         return dm_underlying_device_busy(q);
1603 }
1604
1605 /*
1606  * We return "busy" only when we can map I/Os but the underlying devices
1607  * are busy (so even if we map I/Os now, the I/Os will wait on
1608  * the underlying queue).
1609  * In other words, if we want to kill I/Os or queue them inside us
1610  * due to map unavailability, we don't return "busy".  Otherwise,
1611  * dm core won't give us the I/Os and we can't do what we want.
1612  */
1613 static int multipath_busy(struct dm_target *ti)
1614 {
1615         int busy = 0, has_active = 0;
1616         struct multipath *m = ti->private;
1617         struct priority_group *pg;
1618         struct pgpath *pgpath;
1619         unsigned long flags;
1620
1621         spin_lock_irqsave(&m->lock, flags);
1622
1623         /* pg_init in progress, requeue until done */
1624         if (!pg_ready(m)) {
1625                 busy = 1;
1626                 goto out;
1627         }
1628         /* Guess which priority_group will be used at the next mapping time */
1629         if (unlikely(!m->current_pgpath && m->next_pg))
1630                 pg = m->next_pg;
1631         else if (likely(m->current_pg))
1632                 pg = m->current_pg;
1633         else
1634                 /*
1635                  * We don't know which pg will be used at the next mapping time.
1636                  * We don't call __choose_pgpath() here to avoid triggering
1637                  * pg_init just because of a busy check.
1638                  * So we don't know whether the underlying devices we will be
1639                  * using at the next mapping time are busy or not.  Just try mapping.
1640                  */
1641                 goto out;
1642
1643         /*
1644          * If there is at least one non-busy active path, the path selector
1645          * will be able to select it. So we consider such a pg as not busy.
1646          */
1647         busy = 1;
1648         list_for_each_entry(pgpath, &pg->pgpaths, list)
1649                 if (pgpath->is_active) {
1650                         has_active = 1;
1651
1652                         if (!__pgpath_busy(pgpath)) {
1653                                 busy = 0;
1654                                 break;
1655                         }
1656                 }
1657
1658         if (!has_active)
1659                 /*
1660                  * No active path in this pg, so this pg won't be used and
1661                  * current_pg will change at the next mapping time.  We need
1662                  * to try mapping to determine the new pg.
1663                  */
1664                 busy = 0;
1665
1666 out:
1667         spin_unlock_irqrestore(&m->lock, flags);
1668
1669         return busy;
1670 }
1671
1672 /*-----------------------------------------------------------------
1673  * Module setup
1674  *---------------------------------------------------------------*/
1675 static struct target_type multipath_target = {
1676         .name = "multipath",
1677         .version = {1, 7, 0},
1678         .module = THIS_MODULE,
1679         .ctr = multipath_ctr,
1680         .dtr = multipath_dtr,
1681         .map_rq = multipath_map,
1682         .rq_end_io = multipath_end_io,
1683         .presuspend = multipath_presuspend,
1684         .postsuspend = multipath_postsuspend,
1685         .resume = multipath_resume,
1686         .status = multipath_status,
1687         .message = multipath_message,
1688         .ioctl  = multipath_ioctl,
1689         .iterate_devices = multipath_iterate_devices,
1690         .busy = multipath_busy,
1691 };
1692
1693 static int __init dm_multipath_init(void)
1694 {
1695         int r;
1696
1697         /* allocate a slab cache for the dm_mpath_io structs */
1698         _mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
1699         if (!_mpio_cache)
1700                 return -ENOMEM;
1701
1702         r = dm_register_target(&multipath_target);
1703         if (r < 0) {
1704                 DMERR("register failed %d", r);
1705                 kmem_cache_destroy(_mpio_cache);
1706                 return -EINVAL;
1707         }
1708
1709         kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
1710         if (!kmultipathd) {
1711                 DMERR("failed to create workqueue kmpathd");
1712                 dm_unregister_target(&multipath_target);
1713                 kmem_cache_destroy(_mpio_cache);
1714                 return -ENOMEM;
1715         }
1716
1717         /*
1718          * A separate workqueue is used to handle the device handlers,
1719          * to avoid overloading the existing workqueue.  Overloading
1720          * that workqueue would also create a bottleneck in the storage
1721          * hardware device activation path.
1722          */
1723         kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
1724                                                   WQ_MEM_RECLAIM);
1725         if (!kmpath_handlerd) {
1726                 DMERR("failed to create workqueue kmpath_handlerd");
1727                 destroy_workqueue(kmultipathd);
1728                 dm_unregister_target(&multipath_target);
1729                 kmem_cache_destroy(_mpio_cache);
1730                 return -ENOMEM;
1731         }
1732
1733         DMINFO("version %u.%u.%u loaded",
1734                multipath_target.version[0], multipath_target.version[1],
1735                multipath_target.version[2]);
1736
1737         return r;
1738 }
1739
1740 static void __exit dm_multipath_exit(void)
1741 {
1742         destroy_workqueue(kmpath_handlerd);
1743         destroy_workqueue(kmultipathd);
1744
1745         dm_unregister_target(&multipath_target);
1746         kmem_cache_destroy(_mpio_cache);
1747 }
1748
1749 module_init(dm_multipath_init);
1750 module_exit(dm_multipath_exit);
1751
1752 MODULE_DESCRIPTION(DM_NAME " multipath target");
1753 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
1754 MODULE_LICENSE("GPL");