Merge tag 'hwmon-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/groeck...
[pandora-kernel.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
/*
 * Forward declarations of the rtnetlink notification helpers.
 * qdisc_notify() is called from notify_and_destroy() below; both
 * definitions are presumably further down in this file (not visible
 * in this part).
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by the user in the form of handles
70    into a form more intelligible to the kernel, to perform some
71    sanity checks and the part of the work that is common to all
72    qdiscs, and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
/*
 * Protects the list of registered TC modules (qdisc_base). It is a pure
 * SMP lock: taken for writing by register_qdisc()/unregister_qdisc() and
 * for reading by qdisc_lookup_ops().
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
/*
 * The list of all installed queueing disciplines: a singly linked list
 * chained through Qdisc_ops::next, guarded by qdisc_mod_lock.
 */

static struct Qdisc_ops *qdisc_base;
137
138 /* Register/uregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* We know handle. Find qdisc among all qdisc's attached to device
204    (root qdisc, all its children, children of children etc.)
205  */
206
207 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
208 {
209         struct Qdisc *q;
210
211         if (!(root->flags & TCQ_F_BUILTIN) &&
212             root->handle == handle)
213                 return root;
214
215         list_for_each_entry(q, &root->list, list) {
216                 if (q->handle == handle)
217                         return q;
218         }
219         return NULL;
220 }
221
222 static void qdisc_list_add(struct Qdisc *q)
223 {
224         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227
228 void qdisc_list_del(struct Qdisc *q)
229 {
230         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231                 list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234
235 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
236 {
237         struct Qdisc *q;
238
239         q = qdisc_match_from_root(dev->qdisc, handle);
240         if (q)
241                 goto out;
242
243         if (dev_ingress_queue(dev))
244                 q = qdisc_match_from_root(
245                         dev_ingress_queue(dev)->qdisc_sleeping,
246                         handle);
247 out:
248         return q;
249 }
250
251 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252 {
253         unsigned long cl;
254         struct Qdisc *leaf;
255         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
256
257         if (cops == NULL)
258                 return NULL;
259         cl = cops->get(p, classid);
260
261         if (cl == 0)
262                 return NULL;
263         leaf = cops->leaf(p, cl);
264         cops->put(p, cl);
265         return leaf;
266 }
267
268 /* Find queueing discipline by name */
269
270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272         struct Qdisc_ops *q = NULL;
273
274         if (kind) {
275                 read_lock(&qdisc_mod_lock);
276                 for (q = qdisc_base; q; q = q->next) {
277                         if (nla_strcmp(kind, q->id) == 0) {
278                                 if (!try_module_get(q->owner))
279                                         q = NULL;
280                                 break;
281                         }
282                 }
283                 read_unlock(&qdisc_mod_lock);
284         }
285         return q;
286 }
287
288 static struct qdisc_rate_table *qdisc_rtab_list;
289
290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292         struct qdisc_rate_table *rtab;
293
294         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
295                 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
296                         rtab->refcnt++;
297                         return rtab;
298                 }
299         }
300
301         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
302             nla_len(tab) != TC_RTAB_SIZE)
303                 return NULL;
304
305         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
306         if (rtab) {
307                 rtab->rate = *r;
308                 rtab->refcnt = 1;
309                 memcpy(rtab->data, nla_data(tab), 1024);
310                 rtab->next = qdisc_rtab_list;
311                 qdisc_rtab_list = rtab;
312         }
313         return rtab;
314 }
315 EXPORT_SYMBOL(qdisc_get_rtab);
316
317 void qdisc_put_rtab(struct qdisc_rate_table *tab)
318 {
319         struct qdisc_rate_table *rtab, **rtabp;
320
321         if (!tab || --tab->refcnt)
322                 return;
323
324         for (rtabp = &qdisc_rtab_list;
325              (rtab = *rtabp) != NULL;
326              rtabp = &rtab->next) {
327                 if (rtab == tab) {
328                         *rtabp = rtab->next;
329                         kfree(rtab);
330                         return;
331                 }
332         }
333 }
334 EXPORT_SYMBOL(qdisc_put_rtab);
335
/* List of shared, reference-counted size tables and the lock guarding it. */
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

/*
 * Netlink policy for the attributes nested in TCA_STAB:
 * TCA_STAB_BASE is read as a struct tc_sizespec and TCA_STAB_DATA as a
 * binary blob of u16 slots (see qdisc_get_stab()).
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
343
344 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
345 {
346         struct nlattr *tb[TCA_STAB_MAX + 1];
347         struct qdisc_size_table *stab;
348         struct tc_sizespec *s;
349         unsigned int tsize = 0;
350         u16 *tab = NULL;
351         int err;
352
353         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
354         if (err < 0)
355                 return ERR_PTR(err);
356         if (!tb[TCA_STAB_BASE])
357                 return ERR_PTR(-EINVAL);
358
359         s = nla_data(tb[TCA_STAB_BASE]);
360
361         if (s->tsize > 0) {
362                 if (!tb[TCA_STAB_DATA])
363                         return ERR_PTR(-EINVAL);
364                 tab = nla_data(tb[TCA_STAB_DATA]);
365                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
366         }
367
368         if (tsize != s->tsize || (!tab && tsize > 0))
369                 return ERR_PTR(-EINVAL);
370
371         spin_lock(&qdisc_stab_lock);
372
373         list_for_each_entry(stab, &qdisc_stab_list, list) {
374                 if (memcmp(&stab->szopts, s, sizeof(*s)))
375                         continue;
376                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
377                         continue;
378                 stab->refcnt++;
379                 spin_unlock(&qdisc_stab_lock);
380                 return stab;
381         }
382
383         spin_unlock(&qdisc_stab_lock);
384
385         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
386         if (!stab)
387                 return ERR_PTR(-ENOMEM);
388
389         stab->refcnt = 1;
390         stab->szopts = *s;
391         if (tsize > 0)
392                 memcpy(stab->data, tab, tsize * sizeof(u16));
393
394         spin_lock(&qdisc_stab_lock);
395         list_add_tail(&stab->list, &qdisc_stab_list);
396         spin_unlock(&qdisc_stab_lock);
397
398         return stab;
399 }
400
401 static void stab_kfree_rcu(struct rcu_head *head)
402 {
403         kfree(container_of(head, struct qdisc_size_table, rcu));
404 }
405
406 void qdisc_put_stab(struct qdisc_size_table *tab)
407 {
408         if (!tab)
409                 return;
410
411         spin_lock(&qdisc_stab_lock);
412
413         if (--tab->refcnt == 0) {
414                 list_del(&tab->list);
415                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
416         }
417
418         spin_unlock(&qdisc_stab_lock);
419 }
420 EXPORT_SYMBOL(qdisc_put_stab);
421
422 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
423 {
424         struct nlattr *nest;
425
426         nest = nla_nest_start(skb, TCA_STAB);
427         if (nest == NULL)
428                 goto nla_put_failure;
429         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
430                 goto nla_put_failure;
431         nla_nest_end(skb, nest);
432
433         return skb->len;
434
435 nla_put_failure:
436         return -1;
437 }
438
439 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
440 {
441         int pkt_len, slot;
442
443         pkt_len = skb->len + stab->szopts.overhead;
444         if (unlikely(!stab->szopts.tsize))
445                 goto out;
446
447         slot = pkt_len + stab->szopts.cell_align;
448         if (unlikely(slot < 0))
449                 slot = 0;
450
451         slot >>= stab->szopts.cell_log;
452         if (likely(slot < stab->szopts.tsize))
453                 pkt_len = stab->data[slot];
454         else
455                 pkt_len = stab->data[stab->szopts.tsize - 1] *
456                                 (slot / stab->szopts.tsize) +
457                                 stab->data[slot % stab->szopts.tsize];
458
459         pkt_len <<= stab->szopts.size_log;
460 out:
461         if (unlikely(pkt_len < 1))
462                 pkt_len = 1;
463         qdisc_skb_cb(skb)->pkt_len = pkt_len;
464 }
465 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
466
467 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
468 {
469         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
470                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
471                         txt, qdisc->ops->id, qdisc->handle >> 16);
472                 qdisc->flags |= TCQ_F_WARN_NONWC;
473         }
474 }
475 EXPORT_SYMBOL(qdisc_warn_nonwc);
476
477 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
478 {
479         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
480                                                  timer);
481
482         qdisc_unthrottled(wd->qdisc);
483         __netif_schedule(qdisc_root(wd->qdisc));
484
485         return HRTIMER_NORESTART;
486 }
487
488 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
489 {
490         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
491         wd->timer.function = qdisc_watchdog;
492         wd->qdisc = qdisc;
493 }
494 EXPORT_SYMBOL(qdisc_watchdog_init);
495
496 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
497 {
498         ktime_t time;
499
500         if (test_bit(__QDISC_STATE_DEACTIVATED,
501                      &qdisc_root_sleeping(wd->qdisc)->state))
502                 return;
503
504         qdisc_throttled(wd->qdisc);
505         time = ktime_set(0, 0);
506         time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
507         hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
508 }
509 EXPORT_SYMBOL(qdisc_watchdog_schedule);
510
/*
 * Cancel the watchdog timer of @wd and mark its qdisc as unthrottled
 * again.
 */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
517
518 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
519 {
520         unsigned int size = n * sizeof(struct hlist_head), i;
521         struct hlist_head *h;
522
523         if (size <= PAGE_SIZE)
524                 h = kmalloc(size, GFP_KERNEL);
525         else
526                 h = (struct hlist_head *)
527                         __get_free_pages(GFP_KERNEL, get_order(size));
528
529         if (h != NULL) {
530                 for (i = 0; i < n; i++)
531                         INIT_HLIST_HEAD(&h[i]);
532         }
533         return h;
534 }
535
536 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
537 {
538         unsigned int size = n * sizeof(struct hlist_head);
539
540         if (size <= PAGE_SIZE)
541                 kfree(h);
542         else
543                 free_pages((unsigned long)h, get_order(size));
544 }
545
546 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
547 {
548         struct Qdisc_class_common *cl;
549         struct hlist_node *n, *next;
550         struct hlist_head *nhash, *ohash;
551         unsigned int nsize, nmask, osize;
552         unsigned int i, h;
553
554         /* Rehash when load factor exceeds 0.75 */
555         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
556                 return;
557         nsize = clhash->hashsize * 2;
558         nmask = nsize - 1;
559         nhash = qdisc_class_hash_alloc(nsize);
560         if (nhash == NULL)
561                 return;
562
563         ohash = clhash->hash;
564         osize = clhash->hashsize;
565
566         sch_tree_lock(sch);
567         for (i = 0; i < osize; i++) {
568                 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
569                         h = qdisc_class_hash(cl->classid, nmask);
570                         hlist_add_head(&cl->hnode, &nhash[h]);
571                 }
572         }
573         clhash->hash     = nhash;
574         clhash->hashsize = nsize;
575         clhash->hashmask = nmask;
576         sch_tree_unlock(sch);
577
578         qdisc_class_hash_free(ohash, osize);
579 }
580 EXPORT_SYMBOL(qdisc_class_hash_grow);
581
582 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
583 {
584         unsigned int size = 4;
585
586         clhash->hash = qdisc_class_hash_alloc(size);
587         if (clhash->hash == NULL)
588                 return -ENOMEM;
589         clhash->hashsize  = size;
590         clhash->hashmask  = size - 1;
591         clhash->hashelems = 0;
592         return 0;
593 }
594 EXPORT_SYMBOL(qdisc_class_hash_init);
595
/*
 * Free the bucket array of @clhash. Only the array itself is released;
 * this function does not touch any classes still linked in it.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
601
602 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
603                              struct Qdisc_class_common *cl)
604 {
605         unsigned int h;
606
607         INIT_HLIST_NODE(&cl->hnode);
608         h = qdisc_class_hash(cl->classid, clhash->hashmask);
609         hlist_add_head(&cl->hnode, &clhash->hash[h]);
610         clhash->hashelems++;
611 }
612 EXPORT_SYMBOL(qdisc_class_hash_insert);
613
/*
 * Unlink class @cl from its hash bucket and decrement the element
 * count of @clhash.
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
621
622 /* Allocate an unique handle from space managed by kernel
623  * Possible range is [8000-FFFF]:0000 (0x8000 values)
624  */
625 static u32 qdisc_alloc_handle(struct net_device *dev)
626 {
627         int i = 0x8000;
628         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
629
630         do {
631                 autohandle += TC_H_MAKE(0x10000U, 0);
632                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
633                         autohandle = TC_H_MAKE(0x80000000U, 0);
634                 if (!qdisc_lookup(dev, autohandle))
635                         return autohandle;
636                 cond_resched();
637         } while (--i > 0);
638
639         return 0;
640 }
641
642 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
643 {
644         const struct Qdisc_class_ops *cops;
645         unsigned long cl;
646         u32 parentid;
647
648         if (n == 0)
649                 return;
650         while ((parentid = sch->parent)) {
651                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
652                         return;
653
654                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
655                 if (sch == NULL) {
656                         WARN_ON(parentid != TC_H_ROOT);
657                         return;
658                 }
659                 cops = sch->ops->cl_ops;
660                 if (cops->qlen_notify) {
661                         cl = cops->get(sch, parentid);
662                         cops->qlen_notify(sch, cl);
663                         cops->put(sch, cl);
664                 }
665                 sch->q.qlen -= n;
666         }
667 }
668 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
669
670 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
671                                struct nlmsghdr *n, u32 clid,
672                                struct Qdisc *old, struct Qdisc *new)
673 {
674         if (new || old)
675                 qdisc_notify(net, skb, n, clid, old, new);
676
677         if (old)
678                 qdisc_destroy(old);
679 }
680
681 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
682  * to device "dev".
683  *
684  * When appropriate send a netlink notification using 'skb'
685  * and "n".
686  *
687  * On success, destroy old qdisc.
688  */
689
690 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
691                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
692                        struct Qdisc *new, struct Qdisc *old)
693 {
694         struct Qdisc *q = old;
695         struct net *net = dev_net(dev);
696         int err = 0;
697
698         if (parent == NULL) {
699                 unsigned int i, num_q, ingress;
700
701                 ingress = 0;
702                 num_q = dev->num_tx_queues;
703                 if ((q && q->flags & TCQ_F_INGRESS) ||
704                     (new && new->flags & TCQ_F_INGRESS)) {
705                         num_q = 1;
706                         ingress = 1;
707                         if (!dev_ingress_queue(dev))
708                                 return -ENOENT;
709                 }
710
711                 if (dev->flags & IFF_UP)
712                         dev_deactivate(dev);
713
714                 if (new && new->ops->attach) {
715                         new->ops->attach(new);
716                         num_q = 0;
717                 }
718
719                 for (i = 0; i < num_q; i++) {
720                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
721
722                         if (!ingress)
723                                 dev_queue = netdev_get_tx_queue(dev, i);
724
725                         old = dev_graft_qdisc(dev_queue, new);
726                         if (new && i > 0)
727                                 atomic_inc(&new->refcnt);
728
729                         if (!ingress)
730                                 qdisc_destroy(old);
731                 }
732
733                 if (!ingress) {
734                         notify_and_destroy(net, skb, n, classid,
735                                            dev->qdisc, new);
736                         if (new && !new->ops->attach)
737                                 atomic_inc(&new->refcnt);
738                         dev->qdisc = new ? : &noop_qdisc;
739                 } else {
740                         notify_and_destroy(net, skb, n, classid, old, new);
741                 }
742
743                 if (dev->flags & IFF_UP)
744                         dev_activate(dev);
745         } else {
746                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
747
748                 err = -EOPNOTSUPP;
749                 if (cops && cops->graft) {
750                         unsigned long cl = cops->get(parent, classid);
751                         if (cl) {
752                                 err = cops->graft(parent, cl, new, &old);
753                                 cops->put(parent, cl);
754                         } else
755                                 err = -ENOENT;
756                 }
757                 if (!err)
758                         notify_and_destroy(net, skb, n, classid, old, new);
759         }
760         return err;
761 }
762
/*
 * lockdep annotation is needed for ingress; egress gets it only for name.
 * qdisc_create() assigns one of these classes to each new qdisc's lock.
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
766
767 /*
768    Allocate and initialize new qdisc.
769
770    Parameters are passed via opt.
771  */
772
773 static struct Qdisc *
774 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
775              struct Qdisc *p, u32 parent, u32 handle,
776              struct nlattr **tca, int *errp)
777 {
778         int err;
779         struct nlattr *kind = tca[TCA_KIND];
780         struct Qdisc *sch;
781         struct Qdisc_ops *ops;
782         struct qdisc_size_table *stab;
783
784         ops = qdisc_lookup_ops(kind);
785 #ifdef CONFIG_MODULES
786         if (ops == NULL && kind != NULL) {
787                 char name[IFNAMSIZ];
788                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
789                         /* We dropped the RTNL semaphore in order to
790                          * perform the module load.  So, even if we
791                          * succeeded in loading the module we have to
792                          * tell the caller to replay the request.  We
793                          * indicate this using -EAGAIN.
794                          * We replay the request because the device may
795                          * go away in the mean time.
796                          */
797                         rtnl_unlock();
798                         request_module("sch_%s", name);
799                         rtnl_lock();
800                         ops = qdisc_lookup_ops(kind);
801                         if (ops != NULL) {
802                                 /* We will try again qdisc_lookup_ops,
803                                  * so don't keep a reference.
804                                  */
805                                 module_put(ops->owner);
806                                 err = -EAGAIN;
807                                 goto err_out;
808                         }
809                 }
810         }
811 #endif
812
813         err = -ENOENT;
814         if (ops == NULL)
815                 goto err_out;
816
817         sch = qdisc_alloc(dev_queue, ops);
818         if (IS_ERR(sch)) {
819                 err = PTR_ERR(sch);
820                 goto err_out2;
821         }
822
823         sch->parent = parent;
824
825         if (handle == TC_H_INGRESS) {
826                 sch->flags |= TCQ_F_INGRESS;
827                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
828                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
829         } else {
830                 if (handle == 0) {
831                         handle = qdisc_alloc_handle(dev);
832                         err = -ENOMEM;
833                         if (handle == 0)
834                                 goto err_out3;
835                 }
836                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
837         }
838
839         sch->handle = handle;
840
841         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
842                 if (tca[TCA_STAB]) {
843                         stab = qdisc_get_stab(tca[TCA_STAB]);
844                         if (IS_ERR(stab)) {
845                                 err = PTR_ERR(stab);
846                                 goto err_out4;
847                         }
848                         rcu_assign_pointer(sch->stab, stab);
849                 }
850                 if (tca[TCA_RATE]) {
851                         spinlock_t *root_lock;
852
853                         err = -EOPNOTSUPP;
854                         if (sch->flags & TCQ_F_MQROOT)
855                                 goto err_out4;
856
857                         if ((sch->parent != TC_H_ROOT) &&
858                             !(sch->flags & TCQ_F_INGRESS) &&
859                             (!p || !(p->flags & TCQ_F_MQROOT)))
860                                 root_lock = qdisc_root_sleeping_lock(sch);
861                         else
862                                 root_lock = qdisc_lock(sch);
863
864                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
865                                                 root_lock, tca[TCA_RATE]);
866                         if (err)
867                                 goto err_out4;
868                 }
869
870                 qdisc_list_add(sch);
871
872                 return sch;
873         }
874 err_out3:
875         dev_put(dev);
876         kfree((char *) sch - sch->padded);
877 err_out2:
878         module_put(ops->owner);
879 err_out:
880         *errp = err;
881         return NULL;
882
883 err_out4:
884         /*
885          * Any broken qdiscs that would require a ops->reset() here?
886          * The qdisc was never in action so it shouldn't be necessary.
887          */
888         qdisc_put_stab(rtnl_dereference(sch->stab));
889         if (ops->destroy)
890                 ops->destroy(sch);
891         goto err_out3;
892 }
893
894 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
895 {
896         struct qdisc_size_table *ostab, *stab = NULL;
897         int err = 0;
898
899         if (tca[TCA_OPTIONS]) {
900                 if (sch->ops->change == NULL)
901                         return -EINVAL;
902                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
903                 if (err)
904                         return err;
905         }
906
907         if (tca[TCA_STAB]) {
908                 stab = qdisc_get_stab(tca[TCA_STAB]);
909                 if (IS_ERR(stab))
910                         return PTR_ERR(stab);
911         }
912
913         ostab = rtnl_dereference(sch->stab);
914         rcu_assign_pointer(sch->stab, stab);
915         qdisc_put_stab(ostab);
916
917         if (tca[TCA_RATE]) {
918                 /* NB: ignores errors from replace_estimator
919                    because change can't be undone. */
920                 if (sch->flags & TCQ_F_MQROOT)
921                         goto out;
922                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
923                                             qdisc_root_sleeping_lock(sch),
924                                             tca[TCA_RATE]);
925         }
926 out:
927         return 0;
928 }
929
930 struct check_loop_arg {
931         struct qdisc_walker     w;
932         struct Qdisc            *p;
933         int                     depth;
934 };
935
936 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
937
938 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
939 {
940         struct check_loop_arg   arg;
941
942         if (q->ops->cl_ops == NULL)
943                 return 0;
944
945         arg.w.stop = arg.w.skip = arg.w.count = 0;
946         arg.w.fn = check_loop_fn;
947         arg.depth = depth;
948         arg.p = p;
949         q->ops->cl_ops->walk(q, &arg.w);
950         return arg.w.stop ? -ELOOP : 0;
951 }
952
953 static int
954 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
955 {
956         struct Qdisc *leaf;
957         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
958         struct check_loop_arg *arg = (struct check_loop_arg *)w;
959
960         leaf = cops->leaf(q, cl);
961         if (leaf) {
962                 if (leaf == arg->p || arg->depth > 7)
963                         return -ELOOP;
964                 return check_loop(leaf, arg->p, arg->depth + 1);
965         }
966         return 0;
967 }
968
969 /*
970  * Delete/get qdisc.
971  */
972
973 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
974 {
975         struct net *net = sock_net(skb->sk);
976         struct tcmsg *tcm = nlmsg_data(n);
977         struct nlattr *tca[TCA_MAX + 1];
978         struct net_device *dev;
979         u32 clid = tcm->tcm_parent;
980         struct Qdisc *q = NULL;
981         struct Qdisc *p = NULL;
982         int err;
983
984         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
985         if (!dev)
986                 return -ENODEV;
987
988         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
989         if (err < 0)
990                 return err;
991
992         if (clid) {
993                 if (clid != TC_H_ROOT) {
994                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
995                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
996                                 if (!p)
997                                         return -ENOENT;
998                                 q = qdisc_leaf(p, clid);
999                         } else if (dev_ingress_queue(dev)) {
1000                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1001                         }
1002                 } else {
1003                         q = dev->qdisc;
1004                 }
1005                 if (!q)
1006                         return -ENOENT;
1007
1008                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1009                         return -EINVAL;
1010         } else {
1011                 q = qdisc_lookup(dev, tcm->tcm_handle);
1012                 if (!q)
1013                         return -ENOENT;
1014         }
1015
1016         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1017                 return -EINVAL;
1018
1019         if (n->nlmsg_type == RTM_DELQDISC) {
1020                 if (!clid)
1021                         return -EINVAL;
1022                 if (q->handle == 0)
1023                         return -ENOENT;
1024                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1025                 if (err != 0)
1026                         return err;
1027         } else {
1028                 qdisc_notify(net, skb, n, clid, NULL, q);
1029         }
1030         return 0;
1031 }
1032
1033 /*
1034  * Create/change qdisc.
1035  */
1036
1037 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1038 {
1039         struct net *net = sock_net(skb->sk);
1040         struct tcmsg *tcm;
1041         struct nlattr *tca[TCA_MAX + 1];
1042         struct net_device *dev;
1043         u32 clid;
1044         struct Qdisc *q, *p;
1045         int err;
1046
1047 replay:
1048         /* Reinit, just in case something touches this. */
1049         tcm = nlmsg_data(n);
1050         clid = tcm->tcm_parent;
1051         q = p = NULL;
1052
1053         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1054         if (!dev)
1055                 return -ENODEV;
1056
1057         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1058         if (err < 0)
1059                 return err;
1060
1061         if (clid) {
1062                 if (clid != TC_H_ROOT) {
1063                         if (clid != TC_H_INGRESS) {
1064                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1065                                 if (!p)
1066                                         return -ENOENT;
1067                                 q = qdisc_leaf(p, clid);
1068                         } else if (dev_ingress_queue_create(dev)) {
1069                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1070                         }
1071                 } else {
1072                         q = dev->qdisc;
1073                 }
1074
1075                 /* It may be default qdisc, ignore it */
1076                 if (q && q->handle == 0)
1077                         q = NULL;
1078
1079                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1080                         if (tcm->tcm_handle) {
1081                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1082                                         return -EEXIST;
1083                                 if (TC_H_MIN(tcm->tcm_handle))
1084                                         return -EINVAL;
1085                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1086                                 if (!q)
1087                                         goto create_n_graft;
1088                                 if (n->nlmsg_flags & NLM_F_EXCL)
1089                                         return -EEXIST;
1090                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1091                                         return -EINVAL;
1092                                 if (q == p ||
1093                                     (p && check_loop(q, p, 0)))
1094                                         return -ELOOP;
1095                                 atomic_inc(&q->refcnt);
1096                                 goto graft;
1097                         } else {
1098                                 if (!q)
1099                                         goto create_n_graft;
1100
1101                                 /* This magic test requires explanation.
1102                                  *
1103                                  *   We know, that some child q is already
1104                                  *   attached to this parent and have choice:
1105                                  *   either to change it or to create/graft new one.
1106                                  *
1107                                  *   1. We are allowed to create/graft only
1108                                  *   if CREATE and REPLACE flags are set.
1109                                  *
1110                                  *   2. If EXCL is set, requestor wanted to say,
1111                                  *   that qdisc tcm_handle is not expected
1112                                  *   to exist, so that we choose create/graft too.
1113                                  *
1114                                  *   3. The last case is when no flags are set.
1115                                  *   Alas, it is sort of hole in API, we
1116                                  *   cannot decide what to do unambiguously.
1117                                  *   For now we select create/graft, if
1118                                  *   user gave KIND, which does not match existing.
1119                                  */
1120                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1121                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1122                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1123                                      (tca[TCA_KIND] &&
1124                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1125                                         goto create_n_graft;
1126                         }
1127                 }
1128         } else {
1129                 if (!tcm->tcm_handle)
1130                         return -EINVAL;
1131                 q = qdisc_lookup(dev, tcm->tcm_handle);
1132         }
1133
1134         /* Change qdisc parameters */
1135         if (q == NULL)
1136                 return -ENOENT;
1137         if (n->nlmsg_flags & NLM_F_EXCL)
1138                 return -EEXIST;
1139         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1140                 return -EINVAL;
1141         err = qdisc_change(q, tca);
1142         if (err == 0)
1143                 qdisc_notify(net, skb, n, clid, NULL, q);
1144         return err;
1145
1146 create_n_graft:
1147         if (!(n->nlmsg_flags & NLM_F_CREATE))
1148                 return -ENOENT;
1149         if (clid == TC_H_INGRESS) {
1150                 if (dev_ingress_queue(dev))
1151                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1152                                          tcm->tcm_parent, tcm->tcm_parent,
1153                                          tca, &err);
1154                 else
1155                         err = -ENOENT;
1156         } else {
1157                 struct netdev_queue *dev_queue;
1158
1159                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1160                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1161                 else if (p)
1162                         dev_queue = p->dev_queue;
1163                 else
1164                         dev_queue = netdev_get_tx_queue(dev, 0);
1165
1166                 q = qdisc_create(dev, dev_queue, p,
1167                                  tcm->tcm_parent, tcm->tcm_handle,
1168                                  tca, &err);
1169         }
1170         if (q == NULL) {
1171                 if (err == -EAGAIN)
1172                         goto replay;
1173                 return err;
1174         }
1175
1176 graft:
1177         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1178         if (err) {
1179                 if (q)
1180                         qdisc_destroy(q);
1181                 return err;
1182         }
1183
1184         return 0;
1185 }
1186
1187 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1188                          u32 pid, u32 seq, u16 flags, int event)
1189 {
1190         struct tcmsg *tcm;
1191         struct nlmsghdr  *nlh;
1192         unsigned char *b = skb_tail_pointer(skb);
1193         struct gnet_dump d;
1194         struct qdisc_size_table *stab;
1195
1196         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*tcm), flags);
1197         if (!nlh)
1198                 goto out_nlmsg_trim;
1199         tcm = nlmsg_data(nlh);
1200         tcm->tcm_family = AF_UNSPEC;
1201         tcm->tcm__pad1 = 0;
1202         tcm->tcm__pad2 = 0;
1203         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1204         tcm->tcm_parent = clid;
1205         tcm->tcm_handle = q->handle;
1206         tcm->tcm_info = atomic_read(&q->refcnt);
1207         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1208                 goto nla_put_failure;
1209         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1210                 goto nla_put_failure;
1211         q->qstats.qlen = q->q.qlen;
1212
1213         stab = rtnl_dereference(q->stab);
1214         if (stab && qdisc_dump_stab(skb, stab) < 0)
1215                 goto nla_put_failure;
1216
1217         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1218                                          qdisc_root_sleeping_lock(q), &d) < 0)
1219                 goto nla_put_failure;
1220
1221         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1222                 goto nla_put_failure;
1223
1224         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1225             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1226             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1227                 goto nla_put_failure;
1228
1229         if (gnet_stats_finish_copy(&d) < 0)
1230                 goto nla_put_failure;
1231
1232         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1233         return skb->len;
1234
1235 out_nlmsg_trim:
1236 nla_put_failure:
1237         nlmsg_trim(skb, b);
1238         return -1;
1239 }
1240
1241 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1242 {
1243         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1244 }
1245
1246 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1247                         struct nlmsghdr *n, u32 clid,
1248                         struct Qdisc *old, struct Qdisc *new)
1249 {
1250         struct sk_buff *skb;
1251         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1252
1253         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1254         if (!skb)
1255                 return -ENOBUFS;
1256
1257         if (old && !tc_qdisc_dump_ignore(old)) {
1258                 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
1259                                   0, RTM_DELQDISC) < 0)
1260                         goto err_out;
1261         }
1262         if (new && !tc_qdisc_dump_ignore(new)) {
1263                 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
1264                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1265                         goto err_out;
1266         }
1267
1268         if (skb->len)
1269                 return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1270                                       n->nlmsg_flags & NLM_F_ECHO);
1271
1272 err_out:
1273         kfree_skb(skb);
1274         return -EINVAL;
1275 }
1276
1277 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1278                               struct netlink_callback *cb,
1279                               int *q_idx_p, int s_q_idx)
1280 {
1281         int ret = 0, q_idx = *q_idx_p;
1282         struct Qdisc *q;
1283
1284         if (!root)
1285                 return 0;
1286
1287         q = root;
1288         if (q_idx < s_q_idx) {
1289                 q_idx++;
1290         } else {
1291                 if (!tc_qdisc_dump_ignore(q) &&
1292                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1293                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1294                         goto done;
1295                 q_idx++;
1296         }
1297         list_for_each_entry(q, &root->list, list) {
1298                 if (q_idx < s_q_idx) {
1299                         q_idx++;
1300                         continue;
1301                 }
1302                 if (!tc_qdisc_dump_ignore(q) &&
1303                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1304                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1305                         goto done;
1306                 q_idx++;
1307         }
1308
1309 out:
1310         *q_idx_p = q_idx;
1311         return ret;
1312 done:
1313         ret = -1;
1314         goto out;
1315 }
1316
1317 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1318 {
1319         struct net *net = sock_net(skb->sk);
1320         int idx, q_idx;
1321         int s_idx, s_q_idx;
1322         struct net_device *dev;
1323
1324         s_idx = cb->args[0];
1325         s_q_idx = q_idx = cb->args[1];
1326
1327         rcu_read_lock();
1328         idx = 0;
1329         for_each_netdev_rcu(net, dev) {
1330                 struct netdev_queue *dev_queue;
1331
1332                 if (idx < s_idx)
1333                         goto cont;
1334                 if (idx > s_idx)
1335                         s_q_idx = 0;
1336                 q_idx = 0;
1337
1338                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1339                         goto done;
1340
1341                 dev_queue = dev_ingress_queue(dev);
1342                 if (dev_queue &&
1343                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1344                                        &q_idx, s_q_idx) < 0)
1345                         goto done;
1346
1347 cont:
1348                 idx++;
1349         }
1350
1351 done:
1352         rcu_read_unlock();
1353
1354         cb->args[0] = idx;
1355         cb->args[1] = q_idx;
1356
1357         return skb->len;
1358 }
1359
1360
1361
1362 /************************************************
1363  *      Traffic classes manipulation.           *
1364  ************************************************/
1365
1366
1367
1368 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1369 {
1370         struct net *net = sock_net(skb->sk);
1371         struct tcmsg *tcm = nlmsg_data(n);
1372         struct nlattr *tca[TCA_MAX + 1];
1373         struct net_device *dev;
1374         struct Qdisc *q = NULL;
1375         const struct Qdisc_class_ops *cops;
1376         unsigned long cl = 0;
1377         unsigned long new_cl;
1378         u32 pid = tcm->tcm_parent;
1379         u32 clid = tcm->tcm_handle;
1380         u32 qid = TC_H_MAJ(clid);
1381         int err;
1382
1383         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1384         if (!dev)
1385                 return -ENODEV;
1386
1387         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1388         if (err < 0)
1389                 return err;
1390
1391         /*
1392            parent == TC_H_UNSPEC - unspecified parent.
1393            parent == TC_H_ROOT   - class is root, which has no parent.
1394            parent == X:0         - parent is root class.
1395            parent == X:Y         - parent is a node in hierarchy.
1396            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1397
1398            handle == 0:0         - generate handle from kernel pool.
1399            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1400            handle == X:Y         - clear.
1401            handle == X:0         - root class.
1402          */
1403
1404         /* Step 1. Determine qdisc handle X:0 */
1405
1406         if (pid != TC_H_ROOT) {
1407                 u32 qid1 = TC_H_MAJ(pid);
1408
1409                 if (qid && qid1) {
1410                         /* If both majors are known, they must be identical. */
1411                         if (qid != qid1)
1412                                 return -EINVAL;
1413                 } else if (qid1) {
1414                         qid = qid1;
1415                 } else if (qid == 0)
1416                         qid = dev->qdisc->handle;
1417
1418                 /* Now qid is genuine qdisc handle consistent
1419                  * both with parent and child.
1420                  *
1421                  * TC_H_MAJ(pid) still may be unspecified, complete it now.
1422                  */
1423                 if (pid)
1424                         pid = TC_H_MAKE(qid, pid);
1425         } else {
1426                 if (qid == 0)
1427                         qid = dev->qdisc->handle;
1428         }
1429
1430         /* OK. Locate qdisc */
1431         q = qdisc_lookup(dev, qid);
1432         if (!q)
1433                 return -ENOENT;
1434
1435         /* An check that it supports classes */
1436         cops = q->ops->cl_ops;
1437         if (cops == NULL)
1438                 return -EINVAL;
1439
1440         /* Now try to get class */
1441         if (clid == 0) {
1442                 if (pid == TC_H_ROOT)
1443                         clid = qid;
1444         } else
1445                 clid = TC_H_MAKE(qid, clid);
1446
1447         if (clid)
1448                 cl = cops->get(q, clid);
1449
1450         if (cl == 0) {
1451                 err = -ENOENT;
1452                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1453                     !(n->nlmsg_flags & NLM_F_CREATE))
1454                         goto out;
1455         } else {
1456                 switch (n->nlmsg_type) {
1457                 case RTM_NEWTCLASS:
1458                         err = -EEXIST;
1459                         if (n->nlmsg_flags & NLM_F_EXCL)
1460                                 goto out;
1461                         break;
1462                 case RTM_DELTCLASS:
1463                         err = -EOPNOTSUPP;
1464                         if (cops->delete)
1465                                 err = cops->delete(q, cl);
1466                         if (err == 0)
1467                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1468                         goto out;
1469                 case RTM_GETTCLASS:
1470                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1471                         goto out;
1472                 default:
1473                         err = -EINVAL;
1474                         goto out;
1475                 }
1476         }
1477
1478         new_cl = cl;
1479         err = -EOPNOTSUPP;
1480         if (cops->change)
1481                 err = cops->change(q, clid, pid, tca, &new_cl);
1482         if (err == 0)
1483                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1484
1485 out:
1486         if (cl)
1487                 cops->put(q, cl);
1488
1489         return err;
1490 }
1491
1492
1493 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1494                           unsigned long cl,
1495                           u32 pid, u32 seq, u16 flags, int event)
1496 {
1497         struct tcmsg *tcm;
1498         struct nlmsghdr  *nlh;
1499         unsigned char *b = skb_tail_pointer(skb);
1500         struct gnet_dump d;
1501         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1502
1503         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*tcm), flags);
1504         if (!nlh)
1505                 goto out_nlmsg_trim;
1506         tcm = nlmsg_data(nlh);
1507         tcm->tcm_family = AF_UNSPEC;
1508         tcm->tcm__pad1 = 0;
1509         tcm->tcm__pad2 = 0;
1510         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1511         tcm->tcm_parent = q->handle;
1512         tcm->tcm_handle = q->handle;
1513         tcm->tcm_info = 0;
1514         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1515                 goto nla_put_failure;
1516         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1517                 goto nla_put_failure;
1518
1519         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1520                                          qdisc_root_sleeping_lock(q), &d) < 0)
1521                 goto nla_put_failure;
1522
1523         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1524                 goto nla_put_failure;
1525
1526         if (gnet_stats_finish_copy(&d) < 0)
1527                 goto nla_put_failure;
1528
1529         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1530         return skb->len;
1531
1532 out_nlmsg_trim:
1533 nla_put_failure:
1534         nlmsg_trim(skb, b);
1535         return -1;
1536 }
1537
1538 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1539                          struct nlmsghdr *n, struct Qdisc *q,
1540                          unsigned long cl, int event)
1541 {
1542         struct sk_buff *skb;
1543         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1544
1545         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1546         if (!skb)
1547                 return -ENOBUFS;
1548
1549         if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1550                 kfree_skb(skb);
1551                 return -EINVAL;
1552         }
1553
1554         return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1555                               n->nlmsg_flags & NLM_F_ECHO);
1556 }
1557
1558 struct qdisc_dump_args {
1559         struct qdisc_walker     w;
1560         struct sk_buff          *skb;
1561         struct netlink_callback *cb;
1562 };
1563
1564 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1565 {
1566         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1567
1568         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1569                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1570 }
1571
1572 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1573                                 struct tcmsg *tcm, struct netlink_callback *cb,
1574                                 int *t_p, int s_t)
1575 {
1576         struct qdisc_dump_args arg;
1577
1578         if (tc_qdisc_dump_ignore(q) ||
1579             *t_p < s_t || !q->ops->cl_ops ||
1580             (tcm->tcm_parent &&
1581              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1582                 (*t_p)++;
1583                 return 0;
1584         }
1585         if (*t_p > s_t)
1586                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1587         arg.w.fn = qdisc_class_dump;
1588         arg.skb = skb;
1589         arg.cb = cb;
1590         arg.w.stop  = 0;
1591         arg.w.skip = cb->args[1];
1592         arg.w.count = 0;
1593         q->ops->cl_ops->walk(q, &arg.w);
1594         cb->args[1] = arg.w.count;
1595         if (arg.w.stop)
1596                 return -1;
1597         (*t_p)++;
1598         return 0;
1599 }
1600
1601 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1602                                struct tcmsg *tcm, struct netlink_callback *cb,
1603                                int *t_p, int s_t)
1604 {
1605         struct Qdisc *q;
1606
1607         if (!root)
1608                 return 0;
1609
1610         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1611                 return -1;
1612
1613         list_for_each_entry(q, &root->list, list) {
1614                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1615                         return -1;
1616         }
1617
1618         return 0;
1619 }
1620
1621 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1622 {
1623         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1624         struct net *net = sock_net(skb->sk);
1625         struct netdev_queue *dev_queue;
1626         struct net_device *dev;
1627         int t, s_t;
1628
1629         if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1630                 return 0;
1631         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1632         if (!dev)
1633                 return 0;
1634
1635         s_t = cb->args[0];
1636         t = 0;
1637
1638         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1639                 goto done;
1640
1641         dev_queue = dev_ingress_queue(dev);
1642         if (dev_queue &&
1643             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1644                                 &t, s_t) < 0)
1645                 goto done;
1646
1647 done:
1648         cb->args[0] = t;
1649
1650         dev_put(dev);
1651         return skb->len;
1652 }
1653
1654 /* Main classifier routine: scans classifier chain attached
1655  * to this qdisc, (optionally) tests for protocol and asks
1656  * specific classifiers.
1657  */
1658 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1659                        struct tcf_result *res)
1660 {
1661         __be16 protocol = skb->protocol;
1662         int err;
1663
1664         for (; tp; tp = tp->next) {
1665                 if (tp->protocol != protocol &&
1666                     tp->protocol != htons(ETH_P_ALL))
1667                         continue;
1668                 err = tp->classify(skb, tp, res);
1669
1670                 if (err >= 0) {
1671 #ifdef CONFIG_NET_CLS_ACT
1672                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1673                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1674 #endif
1675                         return err;
1676                 }
1677         }
1678         return -1;
1679 }
1680 EXPORT_SYMBOL(tc_classify_compat);
1681
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is tc_classify_compat().  With it, a
 * TC_ACT_RECLASSIFY verdict restarts classification from the head of the
 * chain; the number of restarts is kept in skb->tc_verd, and once it has
 * reached MAX_REC_LOOP the packet is given the TC_ACT_SHOT verdict and a
 * rate-limited notice is logged.
 *
 * Returns the final verdict.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
                struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
        const struct tcf_proto *first = tp;

        for (;;) {
                u32 verd;
                int rc = tc_classify_compat(skb, tp, res);

                if (rc != TC_ACT_RECLASSIFY)
                        return rc;

                tp = first;
                verd = G_TC_VERD(skb->tc_verd);
                if (verd >= MAX_REC_LOOP) {
                        net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
                                               tp->q->ops->id,
                                               tp->prio & 0xffff,
                                               ntohs(tp->protocol));
                        return TC_ACT_SHOT;
                }

                /* Record one more pass before going around again. */
                verd++;
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
        }
#else
        return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1711
1712 void tcf_destroy(struct tcf_proto *tp)
1713 {
1714         tp->ops->destroy(tp);
1715         module_put(tp->ops->owner);
1716         kfree(tp);
1717 }
1718
1719 void tcf_destroy_chain(struct tcf_proto **fl)
1720 {
1721         struct tcf_proto *tp;
1722
1723         while ((tp = *fl) != NULL) {
1724                 *fl = tp->next;
1725                 tcf_destroy(tp);
1726         }
1727 }
1728 EXPORT_SYMBOL(tcf_destroy_chain);
1729
1730 #ifdef CONFIG_PROC_FS
1731 static int psched_show(struct seq_file *seq, void *v)
1732 {
1733         struct timespec ts;
1734
1735         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1736         seq_printf(seq, "%08x %08x %08x %08x\n",
1737                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1738                    1000000,
1739                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1740
1741         return 0;
1742 }
1743
1744 static int psched_open(struct inode *inode, struct file *file)
1745 {
1746         return single_open(file, psched_show, NULL);
1747 }
1748
1749 static const struct file_operations psched_fops = {
1750         .owner = THIS_MODULE,
1751         .open = psched_open,
1752         .read  = seq_read,
1753         .llseek = seq_lseek,
1754         .release = single_release,
1755 };
1756
1757 static int __net_init psched_net_init(struct net *net)
1758 {
1759         struct proc_dir_entry *e;
1760
1761         e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1762         if (e == NULL)
1763                 return -ENOMEM;
1764
1765         return 0;
1766 }
1767
1768 static void __net_exit psched_net_exit(struct net *net)
1769 {
1770         proc_net_remove(net, "psched");
1771 }
1772 #else
1773 static int __net_init psched_net_init(struct net *net)
1774 {
1775         return 0;
1776 }
1777
1778 static void __net_exit psched_net_exit(struct net *net)
1779 {
1780 }
1781 #endif
1782
1783 static struct pernet_operations psched_net_ops = {
1784         .init = psched_net_init,
1785         .exit = psched_net_exit,
1786 };
1787
1788 static int __init pktsched_init(void)
1789 {
1790         int err;
1791
1792         err = register_pernet_subsys(&psched_net_ops);
1793         if (err) {
1794                 pr_err("pktsched_init: "
1795                        "cannot initialize per netns operations\n");
1796                 return err;
1797         }
1798
1799         register_qdisc(&pfifo_qdisc_ops);
1800         register_qdisc(&bfifo_qdisc_ops);
1801         register_qdisc(&pfifo_head_drop_qdisc_ops);
1802         register_qdisc(&mq_qdisc_ops);
1803
1804         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1805         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1806         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1807         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1808         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1809         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1810
1811         return 0;
1812 }
1813
1814 subsys_initcall(pktsched_init);