hwmon: (applesmc) Ignore some temperature registers
[pandora-kernel.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by the user in the form of handles
70    into a form more intelligible to the kernel, to perform some sanity
71    checks and the part of the work that is common to all qdiscs,
72    and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
125 /* Protects list of registered TC modules. It is pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* We know handle. Find qdisc among all qdisc's attached to device
204    (root qdisc, all its children, children of children etc.)
205  */
206
207 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
208 {
209         struct Qdisc *q;
210
211         if (!(root->flags & TCQ_F_BUILTIN) &&
212             root->handle == handle)
213                 return root;
214
215         list_for_each_entry(q, &root->list, list) {
216                 if (q->handle == handle)
217                         return q;
218         }
219         return NULL;
220 }
221
222 static void qdisc_list_add(struct Qdisc *q)
223 {
224         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227
228 void qdisc_list_del(struct Qdisc *q)
229 {
230         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231                 list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234
235 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
236 {
237         struct Qdisc *q;
238
239         q = qdisc_match_from_root(dev->qdisc, handle);
240         if (q)
241                 goto out;
242
243         if (dev_ingress_queue(dev))
244                 q = qdisc_match_from_root(
245                         dev_ingress_queue(dev)->qdisc_sleeping,
246                         handle);
247 out:
248         return q;
249 }
250
251 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252 {
253         unsigned long cl;
254         struct Qdisc *leaf;
255         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
256
257         if (cops == NULL)
258                 return NULL;
259         cl = cops->get(p, classid);
260
261         if (cl == 0)
262                 return NULL;
263         leaf = cops->leaf(p, cl);
264         cops->put(p, cl);
265         return leaf;
266 }
267
268 /* Find queueing discipline by name */
269
270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272         struct Qdisc_ops *q = NULL;
273
274         if (kind) {
275                 read_lock(&qdisc_mod_lock);
276                 for (q = qdisc_base; q; q = q->next) {
277                         if (nla_strcmp(kind, q->id) == 0) {
278                                 if (!try_module_get(q->owner))
279                                         q = NULL;
280                                 break;
281                         }
282                 }
283                 read_unlock(&qdisc_mod_lock);
284         }
285         return q;
286 }
287
288 static struct qdisc_rate_table *qdisc_rtab_list;
289
290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292         struct qdisc_rate_table *rtab;
293
294         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
295                 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
296                         rtab->refcnt++;
297                         return rtab;
298                 }
299         }
300
301         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
302             nla_len(tab) != TC_RTAB_SIZE)
303                 return NULL;
304
305         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
306         if (rtab) {
307                 rtab->rate = *r;
308                 rtab->refcnt = 1;
309                 memcpy(rtab->data, nla_data(tab), 1024);
310                 rtab->next = qdisc_rtab_list;
311                 qdisc_rtab_list = rtab;
312         }
313         return rtab;
314 }
315 EXPORT_SYMBOL(qdisc_get_rtab);
316
317 void qdisc_put_rtab(struct qdisc_rate_table *tab)
318 {
319         struct qdisc_rate_table *rtab, **rtabp;
320
321         if (!tab || --tab->refcnt)
322                 return;
323
324         for (rtabp = &qdisc_rtab_list;
325              (rtab = *rtabp) != NULL;
326              rtabp = &rtab->next) {
327                 if (rtab == tab) {
328                         *rtabp = rtab->next;
329                         kfree(rtab);
330                         return;
331                 }
332         }
333 }
334 EXPORT_SYMBOL(qdisc_put_rtab);
335
336 static LIST_HEAD(qdisc_stab_list);
337 static DEFINE_SPINLOCK(qdisc_stab_lock);
338
339 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
340         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
341         [TCA_STAB_DATA] = { .type = NLA_BINARY },
342 };
343
344 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
345 {
346         struct nlattr *tb[TCA_STAB_MAX + 1];
347         struct qdisc_size_table *stab;
348         struct tc_sizespec *s;
349         unsigned int tsize = 0;
350         u16 *tab = NULL;
351         int err;
352
353         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
354         if (err < 0)
355                 return ERR_PTR(err);
356         if (!tb[TCA_STAB_BASE])
357                 return ERR_PTR(-EINVAL);
358
359         s = nla_data(tb[TCA_STAB_BASE]);
360
361         if (s->tsize > 0) {
362                 if (!tb[TCA_STAB_DATA])
363                         return ERR_PTR(-EINVAL);
364                 tab = nla_data(tb[TCA_STAB_DATA]);
365                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
366         }
367
368         if (tsize != s->tsize || (!tab && tsize > 0))
369                 return ERR_PTR(-EINVAL);
370
371         spin_lock(&qdisc_stab_lock);
372
373         list_for_each_entry(stab, &qdisc_stab_list, list) {
374                 if (memcmp(&stab->szopts, s, sizeof(*s)))
375                         continue;
376                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
377                         continue;
378                 stab->refcnt++;
379                 spin_unlock(&qdisc_stab_lock);
380                 return stab;
381         }
382
383         spin_unlock(&qdisc_stab_lock);
384
385         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
386         if (!stab)
387                 return ERR_PTR(-ENOMEM);
388
389         stab->refcnt = 1;
390         stab->szopts = *s;
391         if (tsize > 0)
392                 memcpy(stab->data, tab, tsize * sizeof(u16));
393
394         spin_lock(&qdisc_stab_lock);
395         list_add_tail(&stab->list, &qdisc_stab_list);
396         spin_unlock(&qdisc_stab_lock);
397
398         return stab;
399 }
400
401 static void stab_kfree_rcu(struct rcu_head *head)
402 {
403         kfree(container_of(head, struct qdisc_size_table, rcu));
404 }
405
406 void qdisc_put_stab(struct qdisc_size_table *tab)
407 {
408         if (!tab)
409                 return;
410
411         spin_lock(&qdisc_stab_lock);
412
413         if (--tab->refcnt == 0) {
414                 list_del(&tab->list);
415                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
416         }
417
418         spin_unlock(&qdisc_stab_lock);
419 }
420 EXPORT_SYMBOL(qdisc_put_stab);
421
422 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
423 {
424         struct nlattr *nest;
425
426         nest = nla_nest_start(skb, TCA_STAB);
427         if (nest == NULL)
428                 goto nla_put_failure;
429         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
430                 goto nla_put_failure;
431         nla_nest_end(skb, nest);
432
433         return skb->len;
434
435 nla_put_failure:
436         return -1;
437 }
438
439 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
440 {
441         int pkt_len, slot;
442
443         pkt_len = skb->len + stab->szopts.overhead;
444         if (unlikely(!stab->szopts.tsize))
445                 goto out;
446
447         slot = pkt_len + stab->szopts.cell_align;
448         if (unlikely(slot < 0))
449                 slot = 0;
450
451         slot >>= stab->szopts.cell_log;
452         if (likely(slot < stab->szopts.tsize))
453                 pkt_len = stab->data[slot];
454         else
455                 pkt_len = stab->data[stab->szopts.tsize - 1] *
456                                 (slot / stab->szopts.tsize) +
457                                 stab->data[slot % stab->szopts.tsize];
458
459         pkt_len <<= stab->szopts.size_log;
460 out:
461         if (unlikely(pkt_len < 1))
462                 pkt_len = 1;
463         qdisc_skb_cb(skb)->pkt_len = pkt_len;
464 }
465 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
466
467 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
468 {
469         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
470                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
471                         txt, qdisc->ops->id, qdisc->handle >> 16);
472                 qdisc->flags |= TCQ_F_WARN_NONWC;
473         }
474 }
475 EXPORT_SYMBOL(qdisc_warn_nonwc);
476
477 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
478 {
479         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
480                                                  timer);
481
482         qdisc_unthrottled(wd->qdisc);
483         __netif_schedule(qdisc_root(wd->qdisc));
484
485         return HRTIMER_NORESTART;
486 }
487
488 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
489 {
490         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
491         wd->timer.function = qdisc_watchdog;
492         wd->qdisc = qdisc;
493 }
494 EXPORT_SYMBOL(qdisc_watchdog_init);
495
496 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
497 {
498         ktime_t time;
499
500         if (test_bit(__QDISC_STATE_DEACTIVATED,
501                      &qdisc_root_sleeping(wd->qdisc)->state))
502                 return;
503
504         qdisc_throttled(wd->qdisc);
505         time = ktime_set(0, 0);
506         time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
507         hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
508 }
509 EXPORT_SYMBOL(qdisc_watchdog_schedule);
510
511 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
512 {
513         hrtimer_cancel(&wd->timer);
514         qdisc_unthrottled(wd->qdisc);
515 }
516 EXPORT_SYMBOL(qdisc_watchdog_cancel);
517
518 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
519 {
520         unsigned int size = n * sizeof(struct hlist_head), i;
521         struct hlist_head *h;
522
523         if (size <= PAGE_SIZE)
524                 h = kmalloc(size, GFP_KERNEL);
525         else
526                 h = (struct hlist_head *)
527                         __get_free_pages(GFP_KERNEL, get_order(size));
528
529         if (h != NULL) {
530                 for (i = 0; i < n; i++)
531                         INIT_HLIST_HEAD(&h[i]);
532         }
533         return h;
534 }
535
536 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
537 {
538         unsigned int size = n * sizeof(struct hlist_head);
539
540         if (size <= PAGE_SIZE)
541                 kfree(h);
542         else
543                 free_pages((unsigned long)h, get_order(size));
544 }
545
546 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
547 {
548         struct Qdisc_class_common *cl;
549         struct hlist_node *n, *next;
550         struct hlist_head *nhash, *ohash;
551         unsigned int nsize, nmask, osize;
552         unsigned int i, h;
553
554         /* Rehash when load factor exceeds 0.75 */
555         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
556                 return;
557         nsize = clhash->hashsize * 2;
558         nmask = nsize - 1;
559         nhash = qdisc_class_hash_alloc(nsize);
560         if (nhash == NULL)
561                 return;
562
563         ohash = clhash->hash;
564         osize = clhash->hashsize;
565
566         sch_tree_lock(sch);
567         for (i = 0; i < osize; i++) {
568                 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
569                         h = qdisc_class_hash(cl->classid, nmask);
570                         hlist_add_head(&cl->hnode, &nhash[h]);
571                 }
572         }
573         clhash->hash     = nhash;
574         clhash->hashsize = nsize;
575         clhash->hashmask = nmask;
576         sch_tree_unlock(sch);
577
578         qdisc_class_hash_free(ohash, osize);
579 }
580 EXPORT_SYMBOL(qdisc_class_hash_grow);
581
582 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
583 {
584         unsigned int size = 4;
585
586         clhash->hash = qdisc_class_hash_alloc(size);
587         if (clhash->hash == NULL)
588                 return -ENOMEM;
589         clhash->hashsize  = size;
590         clhash->hashmask  = size - 1;
591         clhash->hashelems = 0;
592         return 0;
593 }
594 EXPORT_SYMBOL(qdisc_class_hash_init);
595
596 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
597 {
598         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
599 }
600 EXPORT_SYMBOL(qdisc_class_hash_destroy);
601
602 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
603                              struct Qdisc_class_common *cl)
604 {
605         unsigned int h;
606
607         INIT_HLIST_NODE(&cl->hnode);
608         h = qdisc_class_hash(cl->classid, clhash->hashmask);
609         hlist_add_head(&cl->hnode, &clhash->hash[h]);
610         clhash->hashelems++;
611 }
612 EXPORT_SYMBOL(qdisc_class_hash_insert);
613
614 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
615                              struct Qdisc_class_common *cl)
616 {
617         hlist_del(&cl->hnode);
618         clhash->hashelems--;
619 }
620 EXPORT_SYMBOL(qdisc_class_hash_remove);
621
622 /* Allocate an unique handle from space managed by kernel
623  * Possible range is [8000-FFFF]:0000 (0x8000 values)
624  */
625 static u32 qdisc_alloc_handle(struct net_device *dev)
626 {
627         int i = 0x8000;
628         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
629
630         do {
631                 autohandle += TC_H_MAKE(0x10000U, 0);
632                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
633                         autohandle = TC_H_MAKE(0x80000000U, 0);
634                 if (!qdisc_lookup(dev, autohandle))
635                         return autohandle;
636                 cond_resched();
637         } while (--i > 0);
638
639         return 0;
640 }
641
642 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
643 {
644         const struct Qdisc_class_ops *cops;
645         unsigned long cl;
646         u32 parentid;
647
648         if (n == 0)
649                 return;
650         while ((parentid = sch->parent)) {
651                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
652                         return;
653
654                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
655                 if (sch == NULL) {
656                         WARN_ON(parentid != TC_H_ROOT);
657                         return;
658                 }
659                 cops = sch->ops->cl_ops;
660                 if (cops->qlen_notify) {
661                         cl = cops->get(sch, parentid);
662                         cops->qlen_notify(sch, cl);
663                         cops->put(sch, cl);
664                 }
665                 sch->q.qlen -= n;
666         }
667 }
668 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
669
670 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
671                                struct nlmsghdr *n, u32 clid,
672                                struct Qdisc *old, struct Qdisc *new)
673 {
674         if (new || old)
675                 qdisc_notify(net, skb, n, clid, old, new);
676
677         if (old)
678                 qdisc_destroy(old);
679 }
680
681 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
682  * to device "dev".
683  *
684  * When appropriate send a netlink notification using 'skb'
685  * and "n".
686  *
687  * On success, destroy old qdisc.
688  */
689
690 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
691                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
692                        struct Qdisc *new, struct Qdisc *old)
693 {
694         struct Qdisc *q = old;
695         struct net *net = dev_net(dev);
696         int err = 0;
697
698         if (parent == NULL) {
699                 unsigned int i, num_q, ingress;
700
701                 ingress = 0;
702                 num_q = dev->num_tx_queues;
703                 if ((q && q->flags & TCQ_F_INGRESS) ||
704                     (new && new->flags & TCQ_F_INGRESS)) {
705                         num_q = 1;
706                         ingress = 1;
707                         if (!dev_ingress_queue(dev))
708                                 return -ENOENT;
709                 }
710
711                 if (dev->flags & IFF_UP)
712                         dev_deactivate(dev);
713
714                 if (new && new->ops->attach) {
715                         new->ops->attach(new);
716                         num_q = 0;
717                 }
718
719                 for (i = 0; i < num_q; i++) {
720                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
721
722                         if (!ingress)
723                                 dev_queue = netdev_get_tx_queue(dev, i);
724
725                         old = dev_graft_qdisc(dev_queue, new);
726                         if (new && i > 0)
727                                 atomic_inc(&new->refcnt);
728
729                         if (!ingress)
730                                 qdisc_destroy(old);
731                 }
732
733                 if (!ingress) {
734                         notify_and_destroy(net, skb, n, classid,
735                                            dev->qdisc, new);
736                         if (new && !new->ops->attach)
737                                 atomic_inc(&new->refcnt);
738                         dev->qdisc = new ? : &noop_qdisc;
739                 } else {
740                         notify_and_destroy(net, skb, n, classid, old, new);
741                 }
742
743                 if (dev->flags & IFF_UP)
744                         dev_activate(dev);
745         } else {
746                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
747
748                 err = -EOPNOTSUPP;
749                 if (cops && cops->graft) {
750                         unsigned long cl = cops->get(parent, classid);
751                         if (cl) {
752                                 err = cops->graft(parent, cl, new, &old);
753                                 cops->put(parent, cl);
754                         } else
755                                 err = -ENOENT;
756                 }
757                 if (!err)
758                         notify_and_destroy(net, skb, n, classid, old, new);
759         }
760         return err;
761 }
762
763 /* lockdep annotation is needed for ingress; egress gets it only for name */
764 static struct lock_class_key qdisc_tx_lock;
765 static struct lock_class_key qdisc_rx_lock;
766
767 /*
768    Allocate and initialize new qdisc.
769
770    Parameters are passed via opt.
771  */
772
773 static struct Qdisc *
774 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
775              struct Qdisc *p, u32 parent, u32 handle,
776              struct nlattr **tca, int *errp)
777 {
778         int err;
779         struct nlattr *kind = tca[TCA_KIND];
780         struct Qdisc *sch;
781         struct Qdisc_ops *ops;
782         struct qdisc_size_table *stab;
783
784         ops = qdisc_lookup_ops(kind);
785 #ifdef CONFIG_MODULES
786         if (ops == NULL && kind != NULL) {
787                 char name[IFNAMSIZ];
788                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
789                         /* We dropped the RTNL semaphore in order to
790                          * perform the module load.  So, even if we
791                          * succeeded in loading the module we have to
792                          * tell the caller to replay the request.  We
793                          * indicate this using -EAGAIN.
794                          * We replay the request because the device may
795                          * go away in the mean time.
796                          */
797                         rtnl_unlock();
798                         request_module("sch_%s", name);
799                         rtnl_lock();
800                         ops = qdisc_lookup_ops(kind);
801                         if (ops != NULL) {
802                                 /* We will try again qdisc_lookup_ops,
803                                  * so don't keep a reference.
804                                  */
805                                 module_put(ops->owner);
806                                 err = -EAGAIN;
807                                 goto err_out;
808                         }
809                 }
810         }
811 #endif
812
813         err = -ENOENT;
814         if (ops == NULL)
815                 goto err_out;
816
817         sch = qdisc_alloc(dev_queue, ops);
818         if (IS_ERR(sch)) {
819                 err = PTR_ERR(sch);
820                 goto err_out2;
821         }
822
823         sch->parent = parent;
824
825         if (handle == TC_H_INGRESS) {
826                 sch->flags |= TCQ_F_INGRESS;
827                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
828                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
829         } else {
830                 if (handle == 0) {
831                         handle = qdisc_alloc_handle(dev);
832                         err = -ENOMEM;
833                         if (handle == 0)
834                                 goto err_out3;
835                 }
836                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
837         }
838
839         sch->handle = handle;
840
841         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
842                 if (tca[TCA_STAB]) {
843                         stab = qdisc_get_stab(tca[TCA_STAB]);
844                         if (IS_ERR(stab)) {
845                                 err = PTR_ERR(stab);
846                                 goto err_out4;
847                         }
848                         rcu_assign_pointer(sch->stab, stab);
849                 }
850                 if (tca[TCA_RATE]) {
851                         spinlock_t *root_lock;
852
853                         err = -EOPNOTSUPP;
854                         if (sch->flags & TCQ_F_MQROOT)
855                                 goto err_out4;
856
857                         if ((sch->parent != TC_H_ROOT) &&
858                             !(sch->flags & TCQ_F_INGRESS) &&
859                             (!p || !(p->flags & TCQ_F_MQROOT)))
860                                 root_lock = qdisc_root_sleeping_lock(sch);
861                         else
862                                 root_lock = qdisc_lock(sch);
863
864                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
865                                                 root_lock, tca[TCA_RATE]);
866                         if (err)
867                                 goto err_out4;
868                 }
869
870                 qdisc_list_add(sch);
871
872                 return sch;
873         }
874 err_out3:
875         dev_put(dev);
876         kfree((char *) sch - sch->padded);
877 err_out2:
878         module_put(ops->owner);
879 err_out:
880         *errp = err;
881         return NULL;
882
883 err_out4:
884         /*
885          * Any broken qdiscs that would require a ops->reset() here?
886          * The qdisc was never in action so it shouldn't be necessary.
887          */
888         qdisc_put_stab(rtnl_dereference(sch->stab));
889         if (ops->destroy)
890                 ops->destroy(sch);
891         goto err_out3;
892 }
893
894 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
895 {
896         struct qdisc_size_table *ostab, *stab = NULL;
897         int err = 0;
898
899         if (tca[TCA_OPTIONS]) {
900                 if (sch->ops->change == NULL)
901                         return -EINVAL;
902                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
903                 if (err)
904                         return err;
905         }
906
907         if (tca[TCA_STAB]) {
908                 stab = qdisc_get_stab(tca[TCA_STAB]);
909                 if (IS_ERR(stab))
910                         return PTR_ERR(stab);
911         }
912
913         ostab = rtnl_dereference(sch->stab);
914         rcu_assign_pointer(sch->stab, stab);
915         qdisc_put_stab(ostab);
916
917         if (tca[TCA_RATE]) {
918                 /* NB: ignores errors from replace_estimator
919                    because change can't be undone. */
920                 if (sch->flags & TCQ_F_MQROOT)
921                         goto out;
922                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
923                                             qdisc_root_sleeping_lock(sch),
924                                             tca[TCA_RATE]);
925         }
926 out:
927         return 0;
928 }
929
930 struct check_loop_arg {
931         struct qdisc_walker     w;
932         struct Qdisc            *p;
933         int                     depth;
934 };
935
936 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
937
938 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
939 {
940         struct check_loop_arg   arg;
941
942         if (q->ops->cl_ops == NULL)
943                 return 0;
944
945         arg.w.stop = arg.w.skip = arg.w.count = 0;
946         arg.w.fn = check_loop_fn;
947         arg.depth = depth;
948         arg.p = p;
949         q->ops->cl_ops->walk(q, &arg.w);
950         return arg.w.stop ? -ELOOP : 0;
951 }
952
953 static int
954 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
955 {
956         struct Qdisc *leaf;
957         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
958         struct check_loop_arg *arg = (struct check_loop_arg *)w;
959
960         leaf = cops->leaf(q, cl);
961         if (leaf) {
962                 if (leaf == arg->p || arg->depth > 7)
963                         return -ELOOP;
964                 return check_loop(leaf, arg->p, arg->depth + 1);
965         }
966         return 0;
967 }
968
969 /*
970  * Delete/get qdisc.
971  */
972
973 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
974 {
975         struct net *net = sock_net(skb->sk);
976         struct tcmsg *tcm = NLMSG_DATA(n);
977         struct nlattr *tca[TCA_MAX + 1];
978         struct net_device *dev;
979         u32 clid = tcm->tcm_parent;
980         struct Qdisc *q = NULL;
981         struct Qdisc *p = NULL;
982         int err;
983
984         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
985         if (!dev)
986                 return -ENODEV;
987
988         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
989         if (err < 0)
990                 return err;
991
992         if (clid) {
993                 if (clid != TC_H_ROOT) {
994                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
995                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
996                                 if (!p)
997                                         return -ENOENT;
998                                 q = qdisc_leaf(p, clid);
999                         } else if (dev_ingress_queue(dev)) {
1000                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1001                         }
1002                 } else {
1003                         q = dev->qdisc;
1004                 }
1005                 if (!q)
1006                         return -ENOENT;
1007
1008                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1009                         return -EINVAL;
1010         } else {
1011                 q = qdisc_lookup(dev, tcm->tcm_handle);
1012                 if (!q)
1013                         return -ENOENT;
1014         }
1015
1016         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1017                 return -EINVAL;
1018
1019         if (n->nlmsg_type == RTM_DELQDISC) {
1020                 if (!clid)
1021                         return -EINVAL;
1022                 if (q->handle == 0)
1023                         return -ENOENT;
1024                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1025                 if (err != 0)
1026                         return err;
1027         } else {
1028                 qdisc_notify(net, skb, n, clid, NULL, q);
1029         }
1030         return 0;
1031 }
1032
1033 /*
1034  * Create/change qdisc.
1035  */
1036
1037 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1038 {
1039         struct net *net = sock_net(skb->sk);
1040         struct tcmsg *tcm;
1041         struct nlattr *tca[TCA_MAX + 1];
1042         struct net_device *dev;
1043         u32 clid;
1044         struct Qdisc *q, *p;
1045         int err;
1046
1047 replay:
1048         /* Reinit, just in case something touches this. */
1049         tcm = NLMSG_DATA(n);
1050         clid = tcm->tcm_parent;
1051         q = p = NULL;
1052
1053         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1054         if (!dev)
1055                 return -ENODEV;
1056
1057         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1058         if (err < 0)
1059                 return err;
1060
1061         if (clid) {
1062                 if (clid != TC_H_ROOT) {
1063                         if (clid != TC_H_INGRESS) {
1064                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1065                                 if (!p)
1066                                         return -ENOENT;
1067                                 q = qdisc_leaf(p, clid);
1068                         } else if (dev_ingress_queue_create(dev)) {
1069                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1070                         }
1071                 } else {
1072                         q = dev->qdisc;
1073                 }
1074
1075                 /* It may be default qdisc, ignore it */
1076                 if (q && q->handle == 0)
1077                         q = NULL;
1078
1079                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1080                         if (tcm->tcm_handle) {
1081                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1082                                         return -EEXIST;
1083                                 if (TC_H_MIN(tcm->tcm_handle))
1084                                         return -EINVAL;
1085                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1086                                 if (!q)
1087                                         goto create_n_graft;
1088                                 if (n->nlmsg_flags & NLM_F_EXCL)
1089                                         return -EEXIST;
1090                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1091                                         return -EINVAL;
1092                                 if (q == p ||
1093                                     (p && check_loop(q, p, 0)))
1094                                         return -ELOOP;
1095                                 atomic_inc(&q->refcnt);
1096                                 goto graft;
1097                         } else {
1098                                 if (!q)
1099                                         goto create_n_graft;
1100
1101                                 /* This magic test requires explanation.
1102                                  *
1103                                  *   We know, that some child q is already
1104                                  *   attached to this parent and have choice:
1105                                  *   either to change it or to create/graft new one.
1106                                  *
1107                                  *   1. We are allowed to create/graft only
1108                                  *   if CREATE and REPLACE flags are set.
1109                                  *
1110                                  *   2. If EXCL is set, requestor wanted to say,
1111                                  *   that qdisc tcm_handle is not expected
1112                                  *   to exist, so that we choose create/graft too.
1113                                  *
1114                                  *   3. The last case is when no flags are set.
1115                                  *   Alas, it is sort of hole in API, we
1116                                  *   cannot decide what to do unambiguously.
1117                                  *   For now we select create/graft, if
1118                                  *   user gave KIND, which does not match existing.
1119                                  */
1120                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1121                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1122                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1123                                      (tca[TCA_KIND] &&
1124                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1125                                         goto create_n_graft;
1126                         }
1127                 }
1128         } else {
1129                 if (!tcm->tcm_handle)
1130                         return -EINVAL;
1131                 q = qdisc_lookup(dev, tcm->tcm_handle);
1132         }
1133
1134         /* Change qdisc parameters */
1135         if (q == NULL)
1136                 return -ENOENT;
1137         if (n->nlmsg_flags & NLM_F_EXCL)
1138                 return -EEXIST;
1139         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1140                 return -EINVAL;
1141         err = qdisc_change(q, tca);
1142         if (err == 0)
1143                 qdisc_notify(net, skb, n, clid, NULL, q);
1144         return err;
1145
1146 create_n_graft:
1147         if (!(n->nlmsg_flags & NLM_F_CREATE))
1148                 return -ENOENT;
1149         if (clid == TC_H_INGRESS) {
1150                 if (dev_ingress_queue(dev))
1151                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1152                                          tcm->tcm_parent, tcm->tcm_parent,
1153                                          tca, &err);
1154                 else
1155                         err = -ENOENT;
1156         } else {
1157                 struct netdev_queue *dev_queue;
1158
1159                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1160                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1161                 else if (p)
1162                         dev_queue = p->dev_queue;
1163                 else
1164                         dev_queue = netdev_get_tx_queue(dev, 0);
1165
1166                 q = qdisc_create(dev, dev_queue, p,
1167                                  tcm->tcm_parent, tcm->tcm_handle,
1168                                  tca, &err);
1169         }
1170         if (q == NULL) {
1171                 if (err == -EAGAIN)
1172                         goto replay;
1173                 return err;
1174         }
1175
1176 graft:
1177         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1178         if (err) {
1179                 if (q)
1180                         qdisc_destroy(q);
1181                 return err;
1182         }
1183
1184         return 0;
1185 }
1186
1187 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1188                          u32 pid, u32 seq, u16 flags, int event)
1189 {
1190         struct tcmsg *tcm;
1191         struct nlmsghdr  *nlh;
1192         unsigned char *b = skb_tail_pointer(skb);
1193         struct gnet_dump d;
1194         struct qdisc_size_table *stab;
1195
1196         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1197         tcm = NLMSG_DATA(nlh);
1198         tcm->tcm_family = AF_UNSPEC;
1199         tcm->tcm__pad1 = 0;
1200         tcm->tcm__pad2 = 0;
1201         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1202         tcm->tcm_parent = clid;
1203         tcm->tcm_handle = q->handle;
1204         tcm->tcm_info = atomic_read(&q->refcnt);
1205         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1206                 goto nla_put_failure;
1207         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1208                 goto nla_put_failure;
1209         q->qstats.qlen = q->q.qlen;
1210
1211         stab = rtnl_dereference(q->stab);
1212         if (stab && qdisc_dump_stab(skb, stab) < 0)
1213                 goto nla_put_failure;
1214
1215         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1216                                          qdisc_root_sleeping_lock(q), &d) < 0)
1217                 goto nla_put_failure;
1218
1219         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1220                 goto nla_put_failure;
1221
1222         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1223             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1224             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1225                 goto nla_put_failure;
1226
1227         if (gnet_stats_finish_copy(&d) < 0)
1228                 goto nla_put_failure;
1229
1230         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1231         return skb->len;
1232
1233 nlmsg_failure:
1234 nla_put_failure:
1235         nlmsg_trim(skb, b);
1236         return -1;
1237 }
1238
1239 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1240 {
1241         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1242 }
1243
1244 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1245                         struct nlmsghdr *n, u32 clid,
1246                         struct Qdisc *old, struct Qdisc *new)
1247 {
1248         struct sk_buff *skb;
1249         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1250
1251         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1252         if (!skb)
1253                 return -ENOBUFS;
1254
1255         if (old && !tc_qdisc_dump_ignore(old)) {
1256                 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
1257                                   0, RTM_DELQDISC) < 0)
1258                         goto err_out;
1259         }
1260         if (new && !tc_qdisc_dump_ignore(new)) {
1261                 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
1262                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1263                         goto err_out;
1264         }
1265
1266         if (skb->len)
1267                 return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1268                                       n->nlmsg_flags & NLM_F_ECHO);
1269
1270 err_out:
1271         kfree_skb(skb);
1272         return -EINVAL;
1273 }
1274
1275 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1276                               struct netlink_callback *cb,
1277                               int *q_idx_p, int s_q_idx)
1278 {
1279         int ret = 0, q_idx = *q_idx_p;
1280         struct Qdisc *q;
1281
1282         if (!root)
1283                 return 0;
1284
1285         q = root;
1286         if (q_idx < s_q_idx) {
1287                 q_idx++;
1288         } else {
1289                 if (!tc_qdisc_dump_ignore(q) &&
1290                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1291                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1292                         goto done;
1293                 q_idx++;
1294         }
1295         list_for_each_entry(q, &root->list, list) {
1296                 if (q_idx < s_q_idx) {
1297                         q_idx++;
1298                         continue;
1299                 }
1300                 if (!tc_qdisc_dump_ignore(q) &&
1301                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1302                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1303                         goto done;
1304                 q_idx++;
1305         }
1306
1307 out:
1308         *q_idx_p = q_idx;
1309         return ret;
1310 done:
1311         ret = -1;
1312         goto out;
1313 }
1314
1315 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1316 {
1317         struct net *net = sock_net(skb->sk);
1318         int idx, q_idx;
1319         int s_idx, s_q_idx;
1320         struct net_device *dev;
1321
1322         s_idx = cb->args[0];
1323         s_q_idx = q_idx = cb->args[1];
1324
1325         rcu_read_lock();
1326         idx = 0;
1327         for_each_netdev_rcu(net, dev) {
1328                 struct netdev_queue *dev_queue;
1329
1330                 if (idx < s_idx)
1331                         goto cont;
1332                 if (idx > s_idx)
1333                         s_q_idx = 0;
1334                 q_idx = 0;
1335
1336                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1337                         goto done;
1338
1339                 dev_queue = dev_ingress_queue(dev);
1340                 if (dev_queue &&
1341                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1342                                        &q_idx, s_q_idx) < 0)
1343                         goto done;
1344
1345 cont:
1346                 idx++;
1347         }
1348
1349 done:
1350         rcu_read_unlock();
1351
1352         cb->args[0] = idx;
1353         cb->args[1] = q_idx;
1354
1355         return skb->len;
1356 }
1357
1358
1359
1360 /************************************************
1361  *      Traffic classes manipulation.           *
1362  ************************************************/
1363
1364
1365
1366 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1367 {
1368         struct net *net = sock_net(skb->sk);
1369         struct tcmsg *tcm = NLMSG_DATA(n);
1370         struct nlattr *tca[TCA_MAX + 1];
1371         struct net_device *dev;
1372         struct Qdisc *q = NULL;
1373         const struct Qdisc_class_ops *cops;
1374         unsigned long cl = 0;
1375         unsigned long new_cl;
1376         u32 pid = tcm->tcm_parent;
1377         u32 clid = tcm->tcm_handle;
1378         u32 qid = TC_H_MAJ(clid);
1379         int err;
1380
1381         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1382         if (!dev)
1383                 return -ENODEV;
1384
1385         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1386         if (err < 0)
1387                 return err;
1388
1389         /*
1390            parent == TC_H_UNSPEC - unspecified parent.
1391            parent == TC_H_ROOT   - class is root, which has no parent.
1392            parent == X:0         - parent is root class.
1393            parent == X:Y         - parent is a node in hierarchy.
1394            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1395
1396            handle == 0:0         - generate handle from kernel pool.
1397            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1398            handle == X:Y         - clear.
1399            handle == X:0         - root class.
1400          */
1401
1402         /* Step 1. Determine qdisc handle X:0 */
1403
1404         if (pid != TC_H_ROOT) {
1405                 u32 qid1 = TC_H_MAJ(pid);
1406
1407                 if (qid && qid1) {
1408                         /* If both majors are known, they must be identical. */
1409                         if (qid != qid1)
1410                                 return -EINVAL;
1411                 } else if (qid1) {
1412                         qid = qid1;
1413                 } else if (qid == 0)
1414                         qid = dev->qdisc->handle;
1415
1416                 /* Now qid is genuine qdisc handle consistent
1417                  * both with parent and child.
1418                  *
1419                  * TC_H_MAJ(pid) still may be unspecified, complete it now.
1420                  */
1421                 if (pid)
1422                         pid = TC_H_MAKE(qid, pid);
1423         } else {
1424                 if (qid == 0)
1425                         qid = dev->qdisc->handle;
1426         }
1427
1428         /* OK. Locate qdisc */
1429         q = qdisc_lookup(dev, qid);
1430         if (!q)
1431                 return -ENOENT;
1432
1433         /* An check that it supports classes */
1434         cops = q->ops->cl_ops;
1435         if (cops == NULL)
1436                 return -EINVAL;
1437
1438         /* Now try to get class */
1439         if (clid == 0) {
1440                 if (pid == TC_H_ROOT)
1441                         clid = qid;
1442         } else
1443                 clid = TC_H_MAKE(qid, clid);
1444
1445         if (clid)
1446                 cl = cops->get(q, clid);
1447
1448         if (cl == 0) {
1449                 err = -ENOENT;
1450                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1451                     !(n->nlmsg_flags & NLM_F_CREATE))
1452                         goto out;
1453         } else {
1454                 switch (n->nlmsg_type) {
1455                 case RTM_NEWTCLASS:
1456                         err = -EEXIST;
1457                         if (n->nlmsg_flags & NLM_F_EXCL)
1458                                 goto out;
1459                         break;
1460                 case RTM_DELTCLASS:
1461                         err = -EOPNOTSUPP;
1462                         if (cops->delete)
1463                                 err = cops->delete(q, cl);
1464                         if (err == 0)
1465                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1466                         goto out;
1467                 case RTM_GETTCLASS:
1468                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1469                         goto out;
1470                 default:
1471                         err = -EINVAL;
1472                         goto out;
1473                 }
1474         }
1475
1476         new_cl = cl;
1477         err = -EOPNOTSUPP;
1478         if (cops->change)
1479                 err = cops->change(q, clid, pid, tca, &new_cl);
1480         if (err == 0)
1481                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1482
1483 out:
1484         if (cl)
1485                 cops->put(q, cl);
1486
1487         return err;
1488 }
1489
1490
1491 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1492                           unsigned long cl,
1493                           u32 pid, u32 seq, u16 flags, int event)
1494 {
1495         struct tcmsg *tcm;
1496         struct nlmsghdr  *nlh;
1497         unsigned char *b = skb_tail_pointer(skb);
1498         struct gnet_dump d;
1499         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1500
1501         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1502         tcm = NLMSG_DATA(nlh);
1503         tcm->tcm_family = AF_UNSPEC;
1504         tcm->tcm__pad1 = 0;
1505         tcm->tcm__pad2 = 0;
1506         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1507         tcm->tcm_parent = q->handle;
1508         tcm->tcm_handle = q->handle;
1509         tcm->tcm_info = 0;
1510         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1511                 goto nla_put_failure;
1512         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1513                 goto nla_put_failure;
1514
1515         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1516                                          qdisc_root_sleeping_lock(q), &d) < 0)
1517                 goto nla_put_failure;
1518
1519         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1520                 goto nla_put_failure;
1521
1522         if (gnet_stats_finish_copy(&d) < 0)
1523                 goto nla_put_failure;
1524
1525         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1526         return skb->len;
1527
1528 nlmsg_failure:
1529 nla_put_failure:
1530         nlmsg_trim(skb, b);
1531         return -1;
1532 }
1533
1534 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1535                          struct nlmsghdr *n, struct Qdisc *q,
1536                          unsigned long cl, int event)
1537 {
1538         struct sk_buff *skb;
1539         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1540
1541         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1542         if (!skb)
1543                 return -ENOBUFS;
1544
1545         if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1546                 kfree_skb(skb);
1547                 return -EINVAL;
1548         }
1549
1550         return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1551                               n->nlmsg_flags & NLM_F_ECHO);
1552 }
1553
1554 struct qdisc_dump_args {
1555         struct qdisc_walker     w;
1556         struct sk_buff          *skb;
1557         struct netlink_callback *cb;
1558 };
1559
1560 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1561 {
1562         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1563
1564         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1565                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1566 }
1567
1568 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1569                                 struct tcmsg *tcm, struct netlink_callback *cb,
1570                                 int *t_p, int s_t)
1571 {
1572         struct qdisc_dump_args arg;
1573
1574         if (tc_qdisc_dump_ignore(q) ||
1575             *t_p < s_t || !q->ops->cl_ops ||
1576             (tcm->tcm_parent &&
1577              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1578                 (*t_p)++;
1579                 return 0;
1580         }
1581         if (*t_p > s_t)
1582                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1583         arg.w.fn = qdisc_class_dump;
1584         arg.skb = skb;
1585         arg.cb = cb;
1586         arg.w.stop  = 0;
1587         arg.w.skip = cb->args[1];
1588         arg.w.count = 0;
1589         q->ops->cl_ops->walk(q, &arg.w);
1590         cb->args[1] = arg.w.count;
1591         if (arg.w.stop)
1592                 return -1;
1593         (*t_p)++;
1594         return 0;
1595 }
1596
1597 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1598                                struct tcmsg *tcm, struct netlink_callback *cb,
1599                                int *t_p, int s_t)
1600 {
1601         struct Qdisc *q;
1602
1603         if (!root)
1604                 return 0;
1605
1606         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1607                 return -1;
1608
1609         list_for_each_entry(q, &root->list, list) {
1610                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1611                         return -1;
1612         }
1613
1614         return 0;
1615 }
1616
1617 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1618 {
1619         struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
1620         struct net *net = sock_net(skb->sk);
1621         struct netdev_queue *dev_queue;
1622         struct net_device *dev;
1623         int t, s_t;
1624
1625         if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1626                 return 0;
1627         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1628         if (!dev)
1629                 return 0;
1630
1631         s_t = cb->args[0];
1632         t = 0;
1633
1634         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1635                 goto done;
1636
1637         dev_queue = dev_ingress_queue(dev);
1638         if (dev_queue &&
1639             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1640                                 &t, s_t) < 0)
1641                 goto done;
1642
1643 done:
1644         cb->args[0] = t;
1645
1646         dev_put(dev);
1647         return skb->len;
1648 }
1649
1650 /* Main classifier routine: scans classifier chain attached
1651  * to this qdisc, (optionally) tests for protocol and asks
1652  * specific classifiers.
1653  */
1654 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1655                        struct tcf_result *res)
1656 {
1657         __be16 protocol = skb->protocol;
1658         int err;
1659
1660         for (; tp; tp = tp->next) {
1661                 if (tp->protocol != protocol &&
1662                     tp->protocol != htons(ETH_P_ALL))
1663                         continue;
1664                 err = tp->classify(skb, tp, res);
1665
1666                 if (err >= 0) {
1667 #ifdef CONFIG_NET_CLS_ACT
1668                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1669                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1670 #endif
1671                         return err;
1672                 }
1673         }
1674         return -1;
1675 }
1676 EXPORT_SYMBOL(tc_classify_compat);
1677
/*
 * Classify @skb against the chain @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is tc_classify_compat() and nothing
 * more.  With it, a TC_ACT_RECLASSIFY verdict restarts classification
 * from the head of the chain.  The number of restarts is tracked in
 * skb->tc_verd; once it has reached MAX_REC_LOOP a rate-limited notice
 * is logged and TC_ACT_SHOT is returned.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *head = tp;

	for (;;) {
		int err = tc_classify_compat(skb, tp, res);
		u32 verd;

		if (err != TC_ACT_RECLASSIFY)
			return err;

		verd = G_TC_VERD(skb->tc_verd);
		tp = head;

		if (verd++ >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}

		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1707
1708 void tcf_destroy(struct tcf_proto *tp)
1709 {
1710         tp->ops->destroy(tp);
1711         module_put(tp->ops->owner);
1712         kfree(tp);
1713 }
1714
1715 void tcf_destroy_chain(struct tcf_proto **fl)
1716 {
1717         struct tcf_proto *tp;
1718
1719         while ((tp = *fl) != NULL) {
1720                 *fl = tp->next;
1721                 tcf_destroy(tp);
1722         }
1723 }
1724 EXPORT_SYMBOL(tcf_destroy_chain);
1725
1726 #ifdef CONFIG_PROC_FS
1727 static int psched_show(struct seq_file *seq, void *v)
1728 {
1729         struct timespec ts;
1730
1731         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1732         seq_printf(seq, "%08x %08x %08x %08x\n",
1733                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1734                    1000000,
1735                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1736
1737         return 0;
1738 }
1739
1740 static int psched_open(struct inode *inode, struct file *file)
1741 {
1742         return single_open(file, psched_show, NULL);
1743 }
1744
1745 static const struct file_operations psched_fops = {
1746         .owner = THIS_MODULE,
1747         .open = psched_open,
1748         .read  = seq_read,
1749         .llseek = seq_lseek,
1750         .release = single_release,
1751 };
1752
1753 static int __net_init psched_net_init(struct net *net)
1754 {
1755         struct proc_dir_entry *e;
1756
1757         e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1758         if (e == NULL)
1759                 return -ENOMEM;
1760
1761         return 0;
1762 }
1763
1764 static void __net_exit psched_net_exit(struct net *net)
1765 {
1766         proc_net_remove(net, "psched");
1767 }
1768 #else
1769 static int __net_init psched_net_init(struct net *net)
1770 {
1771         return 0;
1772 }
1773
1774 static void __net_exit psched_net_exit(struct net *net)
1775 {
1776 }
1777 #endif
1778
1779 static struct pernet_operations psched_net_ops = {
1780         .init = psched_net_init,
1781         .exit = psched_net_exit,
1782 };
1783
1784 static int __init pktsched_init(void)
1785 {
1786         int err;
1787
1788         err = register_pernet_subsys(&psched_net_ops);
1789         if (err) {
1790                 pr_err("pktsched_init: "
1791                        "cannot initialize per netns operations\n");
1792                 return err;
1793         }
1794
1795         register_qdisc(&pfifo_qdisc_ops);
1796         register_qdisc(&bfifo_qdisc_ops);
1797         register_qdisc(&pfifo_head_drop_qdisc_ops);
1798         register_qdisc(&mq_qdisc_ops);
1799
1800         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1801         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1802         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1803         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1804         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1805         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1806
1807         return 0;
1808 }
1809
1810 subsys_initcall(pktsched_init);