[pandora-kernel.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, a queueing discipline ("qdisc") is a black box
56    which is able to enqueue packets and to dequeue them (when
57    the device is ready to send something) in an order and at times
58    determined by the algorithm hidden inside it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as a rule, queues)
66    attached to them, and so on.
67
68    The goal of the routines in this file is to translate
69    information supplied by the user in the form of handles
70    into a form more intelligible to the kernel, to perform the
71    sanity checks and the part of the work that is common to all
72    qdiscs, and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but that does not mean the queue is empty; it just means that the
84    discipline does not want to send anything at this time.
85    The queue is really empty only if q->q.qlen == 0.
86    For complicated disciplines with multiple queues, q->q is not the
87    real packet queue, but q->q.qlen must nevertheless be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet was dropped.
95      Expected action: do not back off, but wait until the queue clears.
96    NET_XMIT_CN          - probably this packet was enqueued, but another one was dropped.
97      Expected action: back off or ignore.
98    NET_XMIT_POLICED     - dropped by the policer.
99      Expected action: back off or report an error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns the qdisc to its initial state: purges all buffers, clears all
110    timers and counters (except for statistics), etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
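/*
 * Editorial example (illustrative sketch, not part of this file): a minimal
 * classless FIFO-style qdisc that follows the enqueue/dequeue contract
 * described above.  The name "example_fifo" and the ops table are
 * hypothetical; the helpers are the generic ones from <net/sch_generic.h>,
 * and a module would hand the table to register_qdisc() from its init hook.
 */
#if 0   /* example only, never compiled */
static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        /* Tail-enqueue on sch->q; returns NET_XMIT_SUCCESS (0). */
        return qdisc_enqueue_tail(skb, sch);
}

static struct Qdisc_ops example_fifo_qdisc_ops __read_mostly = {
        .id             = "example_fifo",
        .priv_size      = 0,
        .enqueue        = example_fifo_enqueue,
        .dequeue        = qdisc_dequeue_head,  /* NULL when the queue is empty */
        .peek           = qdisc_peek_head,
        .drop           = qdisc_queue_drop,
        .owner          = THIS_MODULE,
};
#endif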
124
125 /* Protects list of registered TC modules. It is pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister a queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* We know the handle. Find the qdisc among all qdiscs attached to the
204    device (root qdisc, all its children, children of children, etc.)
205  */
206
207 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
208 {
209         struct Qdisc *q;
210
211         if (!(root->flags & TCQ_F_BUILTIN) &&
212             root->handle == handle)
213                 return root;
214
215         list_for_each_entry(q, &root->list, list) {
216                 if (q->handle == handle)
217                         return q;
218         }
219         return NULL;
220 }
221
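/* Qdiscs other than the root and the ingress one are linked onto the root
 * qdisc's ->list, so that qdisc_match_from_root()/qdisc_lookup() can find
 * them by handle.
 */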
222 static void qdisc_list_add(struct Qdisc *q)
223 {
224         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227
228 void qdisc_list_del(struct Qdisc *q)
229 {
230         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231                 list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234
235 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
236 {
237         struct Qdisc *q;
238
239         q = qdisc_match_from_root(dev->qdisc, handle);
240         if (q)
241                 goto out;
242
243         if (dev_ingress_queue(dev))
244                 q = qdisc_match_from_root(
245                         dev_ingress_queue(dev)->qdisc_sleeping,
246                         handle);
247 out:
248         return q;
249 }
250
251 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252 {
253         unsigned long cl;
254         struct Qdisc *leaf;
255         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
256
257         if (cops == NULL)
258                 return NULL;
259         cl = cops->get(p, classid);
260
261         if (cl == 0)
262                 return NULL;
263         leaf = cops->leaf(p, cl);
264         cops->put(p, cl);
265         return leaf;
266 }
267
268 /* Find queueing discipline by name */
269
270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272         struct Qdisc_ops *q = NULL;
273
274         if (kind) {
275                 read_lock(&qdisc_mod_lock);
276                 for (q = qdisc_base; q; q = q->next) {
277                         if (nla_strcmp(kind, q->id) == 0) {
278                                 if (!try_module_get(q->owner))
279                                         q = NULL;
280                                 break;
281                         }
282                 }
283                 read_unlock(&qdisc_mod_lock);
284         }
285         return q;
286 }
287
288 static struct qdisc_rate_table *qdisc_rtab_list;
289
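/* Look up a cached rate table matching r and take a reference on it, or
 * build a new one from the TC_RTAB_SIZE bytes carried in the netlink
 * attribute.
 */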
290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292         struct qdisc_rate_table *rtab;
293
294         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
295                 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
296                         rtab->refcnt++;
297                         return rtab;
298                 }
299         }
300
301         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
302             nla_len(tab) != TC_RTAB_SIZE)
303                 return NULL;
304
305         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
306         if (rtab) {
307                 rtab->rate = *r;
308                 rtab->refcnt = 1;
309                 memcpy(rtab->data, nla_data(tab), 1024);
310                 rtab->next = qdisc_rtab_list;
311                 qdisc_rtab_list = rtab;
312         }
313         return rtab;
314 }
315 EXPORT_SYMBOL(qdisc_get_rtab);
316
317 void qdisc_put_rtab(struct qdisc_rate_table *tab)
318 {
319         struct qdisc_rate_table *rtab, **rtabp;
320
321         if (!tab || --tab->refcnt)
322                 return;
323
324         for (rtabp = &qdisc_rtab_list;
325              (rtab = *rtabp) != NULL;
326              rtabp = &rtab->next) {
327                 if (rtab == tab) {
328                         *rtabp = rtab->next;
329                         kfree(rtab);
330                         return;
331                 }
332         }
333 }
334 EXPORT_SYMBOL(qdisc_put_rtab);
335
336 static LIST_HEAD(qdisc_stab_list);
337 static DEFINE_SPINLOCK(qdisc_stab_lock);
338
339 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
340         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
341         [TCA_STAB_DATA] = { .type = NLA_BINARY },
342 };
343
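/* Look up a size table matching the TCA_STAB attributes and share it by
 * reference count, or allocate and register a new one.
 */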
344 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
345 {
346         struct nlattr *tb[TCA_STAB_MAX + 1];
347         struct qdisc_size_table *stab;
348         struct tc_sizespec *s;
349         unsigned int tsize = 0;
350         u16 *tab = NULL;
351         int err;
352
353         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
354         if (err < 0)
355                 return ERR_PTR(err);
356         if (!tb[TCA_STAB_BASE])
357                 return ERR_PTR(-EINVAL);
358
359         s = nla_data(tb[TCA_STAB_BASE]);
360
361         if (s->tsize > 0) {
362                 if (!tb[TCA_STAB_DATA])
363                         return ERR_PTR(-EINVAL);
364                 tab = nla_data(tb[TCA_STAB_DATA]);
365                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
366         }
367
368         if (tsize != s->tsize || (!tab && tsize > 0))
369                 return ERR_PTR(-EINVAL);
370
371         spin_lock(&qdisc_stab_lock);
372
373         list_for_each_entry(stab, &qdisc_stab_list, list) {
374                 if (memcmp(&stab->szopts, s, sizeof(*s)))
375                         continue;
376                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
377                         continue;
378                 stab->refcnt++;
379                 spin_unlock(&qdisc_stab_lock);
380                 return stab;
381         }
382
383         spin_unlock(&qdisc_stab_lock);
384
385         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
386         if (!stab)
387                 return ERR_PTR(-ENOMEM);
388
389         stab->refcnt = 1;
390         stab->szopts = *s;
391         if (tsize > 0)
392                 memcpy(stab->data, tab, tsize * sizeof(u16));
393
394         spin_lock(&qdisc_stab_lock);
395         list_add_tail(&stab->list, &qdisc_stab_list);
396         spin_unlock(&qdisc_stab_lock);
397
398         return stab;
399 }
400
401 static void stab_kfree_rcu(struct rcu_head *head)
402 {
403         kfree(container_of(head, struct qdisc_size_table, rcu));
404 }
405
406 void qdisc_put_stab(struct qdisc_size_table *tab)
407 {
408         if (!tab)
409                 return;
410
411         spin_lock(&qdisc_stab_lock);
412
413         if (--tab->refcnt == 0) {
414                 list_del(&tab->list);
415                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
416         }
417
418         spin_unlock(&qdisc_stab_lock);
419 }
420 EXPORT_SYMBOL(qdisc_put_stab);
421
422 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
423 {
424         struct nlattr *nest;
425
426         nest = nla_nest_start(skb, TCA_STAB);
427         if (nest == NULL)
428                 goto nla_put_failure;
429         NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
430         nla_nest_end(skb, nest);
431
432         return skb->len;
433
434 nla_put_failure:
435         return -1;
436 }
437
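/* Map the real packet length through a size table: add the configured
 * overhead, apply the cell alignment, index the table by
 * (length >> cell_log), and extrapolate from the last slot for lengths
 * beyond the table.  The result is stored in qdisc_skb_cb(skb)->pkt_len.
 */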
438 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
439 {
440         int pkt_len, slot;
441
442         pkt_len = skb->len + stab->szopts.overhead;
443         if (unlikely(!stab->szopts.tsize))
444                 goto out;
445
446         slot = pkt_len + stab->szopts.cell_align;
447         if (unlikely(slot < 0))
448                 slot = 0;
449
450         slot >>= stab->szopts.cell_log;
451         if (likely(slot < stab->szopts.tsize))
452                 pkt_len = stab->data[slot];
453         else
454                 pkt_len = stab->data[stab->szopts.tsize - 1] *
455                                 (slot / stab->szopts.tsize) +
456                                 stab->data[slot % stab->szopts.tsize];
457
458         pkt_len <<= stab->szopts.size_log;
459 out:
460         if (unlikely(pkt_len < 1))
461                 pkt_len = 1;
462         qdisc_skb_cb(skb)->pkt_len = pkt_len;
463 }
464 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
465
466 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
467 {
468         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
469                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
470                         txt, qdisc->ops->id, qdisc->handle >> 16);
471                 qdisc->flags |= TCQ_F_WARN_NONWC;
472         }
473 }
474 EXPORT_SYMBOL(qdisc_warn_nonwc);
475
476 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
477 {
478         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
479                                                  timer);
480
481         qdisc_unthrottled(wd->qdisc);
482         __netif_schedule(qdisc_root(wd->qdisc));
483
484         return HRTIMER_NORESTART;
485 }
486
487 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
488 {
489         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
490         wd->timer.function = qdisc_watchdog;
491         wd->qdisc = qdisc;
492 }
493 EXPORT_SYMBOL(qdisc_watchdog_init);
494
495 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
496 {
497         ktime_t time;
498
499         if (test_bit(__QDISC_STATE_DEACTIVATED,
500                      &qdisc_root_sleeping(wd->qdisc)->state))
501                 return;
502
503         qdisc_throttled(wd->qdisc);
504         time = ktime_set(0, 0);
505         time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
506         hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
507 }
508 EXPORT_SYMBOL(qdisc_watchdog_schedule);
509
510 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
511 {
512         hrtimer_cancel(&wd->timer);
513         qdisc_unthrottled(wd->qdisc);
514 }
515 EXPORT_SYMBOL(qdisc_watchdog_cancel);
516
517 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
518 {
519         unsigned int size = n * sizeof(struct hlist_head), i;
520         struct hlist_head *h;
521
522         if (size <= PAGE_SIZE)
523                 h = kmalloc(size, GFP_KERNEL);
524         else
525                 h = (struct hlist_head *)
526                         __get_free_pages(GFP_KERNEL, get_order(size));
527
528         if (h != NULL) {
529                 for (i = 0; i < n; i++)
530                         INIT_HLIST_HEAD(&h[i]);
531         }
532         return h;
533 }
534
535 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
536 {
537         unsigned int size = n * sizeof(struct hlist_head);
538
539         if (size <= PAGE_SIZE)
540                 kfree(h);
541         else
542                 free_pages((unsigned long)h, get_order(size));
543 }
544
545 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
546 {
547         struct Qdisc_class_common *cl;
548         struct hlist_node *n, *next;
549         struct hlist_head *nhash, *ohash;
550         unsigned int nsize, nmask, osize;
551         unsigned int i, h;
552
553         /* Rehash when load factor exceeds 0.75 */
554         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
555                 return;
556         nsize = clhash->hashsize * 2;
557         nmask = nsize - 1;
558         nhash = qdisc_class_hash_alloc(nsize);
559         if (nhash == NULL)
560                 return;
561
562         ohash = clhash->hash;
563         osize = clhash->hashsize;
564
565         sch_tree_lock(sch);
566         for (i = 0; i < osize; i++) {
567                 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
568                         h = qdisc_class_hash(cl->classid, nmask);
569                         hlist_add_head(&cl->hnode, &nhash[h]);
570                 }
571         }
572         clhash->hash     = nhash;
573         clhash->hashsize = nsize;
574         clhash->hashmask = nmask;
575         sch_tree_unlock(sch);
576
577         qdisc_class_hash_free(ohash, osize);
578 }
579 EXPORT_SYMBOL(qdisc_class_hash_grow);
580
581 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
582 {
583         unsigned int size = 4;
584
585         clhash->hash = qdisc_class_hash_alloc(size);
586         if (clhash->hash == NULL)
587                 return -ENOMEM;
588         clhash->hashsize  = size;
589         clhash->hashmask  = size - 1;
590         clhash->hashelems = 0;
591         return 0;
592 }
593 EXPORT_SYMBOL(qdisc_class_hash_init);
594
595 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
596 {
597         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
598 }
599 EXPORT_SYMBOL(qdisc_class_hash_destroy);
600
601 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
602                              struct Qdisc_class_common *cl)
603 {
604         unsigned int h;
605
606         INIT_HLIST_NODE(&cl->hnode);
607         h = qdisc_class_hash(cl->classid, clhash->hashmask);
608         hlist_add_head(&cl->hnode, &clhash->hash[h]);
609         clhash->hashelems++;
610 }
611 EXPORT_SYMBOL(qdisc_class_hash_insert);
612
613 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
614                              struct Qdisc_class_common *cl)
615 {
616         hlist_del(&cl->hnode);
617         clhash->hashelems--;
618 }
619 EXPORT_SYMBOL(qdisc_class_hash_remove);
620
621 /* Allocate a unique handle from the space managed by the kernel */
622
623 static u32 qdisc_alloc_handle(struct net_device *dev)
624 {
625         int i = 0x10000;
626         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
627
628         do {
629                 autohandle += TC_H_MAKE(0x10000U, 0);
630                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
631                         autohandle = TC_H_MAKE(0x80000000U, 0);
632         } while (qdisc_lookup(dev, autohandle) && --i > 0);
633
634         return i > 0 ? autohandle : 0;
635 }
636
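/* Propagate a decrease of n packets up the qdisc tree: notify each parent
 * class on the way up (so classful qdiscs can e.g. deactivate classes that
 * became empty) and fix up the cached qlen of every ancestor qdisc.
 */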
637 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
638 {
639         const struct Qdisc_class_ops *cops;
640         unsigned long cl;
641         u32 parentid;
642
643         if (n == 0)
644                 return;
645         while ((parentid = sch->parent)) {
646                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
647                         return;
648
649                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
650                 if (sch == NULL) {
651                         WARN_ON(parentid != TC_H_ROOT);
652                         return;
653                 }
654                 cops = sch->ops->cl_ops;
655                 if (cops->qlen_notify) {
656                         cl = cops->get(sch, parentid);
657                         cops->qlen_notify(sch, cl);
658                         cops->put(sch, cl);
659                 }
660                 sch->q.qlen -= n;
661         }
662 }
663 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
664
665 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
666                                struct nlmsghdr *n, u32 clid,
667                                struct Qdisc *old, struct Qdisc *new)
668 {
669         if (new || old)
670                 qdisc_notify(net, skb, n, clid, old, new);
671
672         if (old)
673                 qdisc_destroy(old);
674 }
675
676 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
677  * to device "dev".
678  *
679  * When appropriate send a netlink notification using 'skb'
680  * and "n".
681  *
682  * On success, destroy old qdisc.
683  */
684
685 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
686                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
687                        struct Qdisc *new, struct Qdisc *old)
688 {
689         struct Qdisc *q = old;
690         struct net *net = dev_net(dev);
691         int err = 0;
692
693         if (parent == NULL) {
694                 unsigned int i, num_q, ingress;
695
696                 ingress = 0;
697                 num_q = dev->num_tx_queues;
698                 if ((q && q->flags & TCQ_F_INGRESS) ||
699                     (new && new->flags & TCQ_F_INGRESS)) {
700                         num_q = 1;
701                         ingress = 1;
702                         if (!dev_ingress_queue(dev))
703                                 return -ENOENT;
704                 }
705
706                 if (dev->flags & IFF_UP)
707                         dev_deactivate(dev);
708
709                 if (new && new->ops->attach) {
710                         new->ops->attach(new);
711                         num_q = 0;
712                 }
713
714                 for (i = 0; i < num_q; i++) {
715                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
716
717                         if (!ingress)
718                                 dev_queue = netdev_get_tx_queue(dev, i);
719
720                         old = dev_graft_qdisc(dev_queue, new);
721                         if (new && i > 0)
722                                 atomic_inc(&new->refcnt);
723
724                         if (!ingress)
725                                 qdisc_destroy(old);
726                 }
727
728                 if (!ingress) {
729                         notify_and_destroy(net, skb, n, classid,
730                                            dev->qdisc, new);
731                         if (new && !new->ops->attach)
732                                 atomic_inc(&new->refcnt);
733                         dev->qdisc = new ? : &noop_qdisc;
734                 } else {
735                         notify_and_destroy(net, skb, n, classid, old, new);
736                 }
737
738                 if (dev->flags & IFF_UP)
739                         dev_activate(dev);
740         } else {
741                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
742
743                 err = -EOPNOTSUPP;
744                 if (cops && cops->graft) {
745                         unsigned long cl = cops->get(parent, classid);
746                         if (cl) {
747                                 err = cops->graft(parent, cl, new, &old);
748                                 cops->put(parent, cl);
749                         } else
750                                 err = -ENOENT;
751                 }
752                 if (!err)
753                         notify_and_destroy(net, skb, n, classid, old, new);
754         }
755         return err;
756 }
757
758 /* lockdep annotation is needed for ingress; egress gets it only for name */
759 static struct lock_class_key qdisc_tx_lock;
760 static struct lock_class_key qdisc_rx_lock;
761
762 /*
763    Allocate and initialize new qdisc.
764
765    Parameters are passed via opt.
766  */
767
768 static struct Qdisc *
769 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
770              struct Qdisc *p, u32 parent, u32 handle,
771              struct nlattr **tca, int *errp)
772 {
773         int err;
774         struct nlattr *kind = tca[TCA_KIND];
775         struct Qdisc *sch;
776         struct Qdisc_ops *ops;
777         struct qdisc_size_table *stab;
778
779         ops = qdisc_lookup_ops(kind);
780 #ifdef CONFIG_MODULES
781         if (ops == NULL && kind != NULL) {
782                 char name[IFNAMSIZ];
783                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
784                         /* We dropped the RTNL semaphore in order to
785                          * perform the module load.  So, even if we
786                          * succeeded in loading the module we have to
787                          * tell the caller to replay the request.  We
788                          * indicate this using -EAGAIN.
789                          * We replay the request because the device may
790                          * go away in the mean time.
791                          */
792                         rtnl_unlock();
793                         request_module("sch_%s", name);
794                         rtnl_lock();
795                         ops = qdisc_lookup_ops(kind);
796                         if (ops != NULL) {
797                                 /* We will try again qdisc_lookup_ops,
798                                  * so don't keep a reference.
799                                  */
800                                 module_put(ops->owner);
801                                 err = -EAGAIN;
802                                 goto err_out;
803                         }
804                 }
805         }
806 #endif
807
808         err = -ENOENT;
809         if (ops == NULL)
810                 goto err_out;
811
812         sch = qdisc_alloc(dev_queue, ops);
813         if (IS_ERR(sch)) {
814                 err = PTR_ERR(sch);
815                 goto err_out2;
816         }
817
818         sch->parent = parent;
819
820         if (handle == TC_H_INGRESS) {
821                 sch->flags |= TCQ_F_INGRESS;
822                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
823                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
824         } else {
825                 if (handle == 0) {
826                         handle = qdisc_alloc_handle(dev);
827                         err = -ENOMEM;
828                         if (handle == 0)
829                                 goto err_out3;
830                 }
831                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
832         }
833
834         sch->handle = handle;
835
836         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
837                 if (tca[TCA_STAB]) {
838                         stab = qdisc_get_stab(tca[TCA_STAB]);
839                         if (IS_ERR(stab)) {
840                                 err = PTR_ERR(stab);
841                                 goto err_out4;
842                         }
843                         rcu_assign_pointer(sch->stab, stab);
844                 }
845                 if (tca[TCA_RATE]) {
846                         spinlock_t *root_lock;
847
848                         err = -EOPNOTSUPP;
849                         if (sch->flags & TCQ_F_MQROOT)
850                                 goto err_out4;
851
852                         if ((sch->parent != TC_H_ROOT) &&
853                             !(sch->flags & TCQ_F_INGRESS) &&
854                             (!p || !(p->flags & TCQ_F_MQROOT)))
855                                 root_lock = qdisc_root_sleeping_lock(sch);
856                         else
857                                 root_lock = qdisc_lock(sch);
858
859                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
860                                                 root_lock, tca[TCA_RATE]);
861                         if (err)
862                                 goto err_out4;
863                 }
864
865                 qdisc_list_add(sch);
866
867                 return sch;
868         }
869         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
870         ops->destroy(sch);
871 err_out3:
872         dev_put(dev);
873         kfree((char *) sch - sch->padded);
874 err_out2:
875         module_put(ops->owner);
876 err_out:
877         *errp = err;
878         return NULL;
879
880 err_out4:
881         /*
882          * Any broken qdiscs that would require a ops->reset() here?
883          * The qdisc was never in action so it shouldn't be necessary.
884          */
885         qdisc_put_stab(rtnl_dereference(sch->stab));
886         if (ops->destroy)
887                 ops->destroy(sch);
888         goto err_out3;
889 }
890
891 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
892 {
893         struct qdisc_size_table *ostab, *stab = NULL;
894         int err = 0;
895
896         if (tca[TCA_OPTIONS]) {
897                 if (sch->ops->change == NULL)
898                         return -EINVAL;
899                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
900                 if (err)
901                         return err;
902         }
903
904         if (tca[TCA_STAB]) {
905                 stab = qdisc_get_stab(tca[TCA_STAB]);
906                 if (IS_ERR(stab))
907                         return PTR_ERR(stab);
908         }
909
910         ostab = rtnl_dereference(sch->stab);
911         rcu_assign_pointer(sch->stab, stab);
912         qdisc_put_stab(ostab);
913
914         if (tca[TCA_RATE]) {
915                 /* NB: ignores errors from replace_estimator
916                    because change can't be undone. */
917                 if (sch->flags & TCQ_F_MQROOT)
918                         goto out;
919                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
920                                             qdisc_root_sleeping_lock(sch),
921                                             tca[TCA_RATE]);
922         }
923 out:
924         return 0;
925 }
926
927 struct check_loop_arg {
928         struct qdisc_walker     w;
929         struct Qdisc            *p;
930         int                     depth;
931 };
932
933 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
934
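/* Return -ELOOP if qdisc p is reachable from q's class tree (or if the
 * nesting is deeper than 7 levels); used to refuse grafts that would
 * create a cycle in the qdisc hierarchy.
 */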
935 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
936 {
937         struct check_loop_arg   arg;
938
939         if (q->ops->cl_ops == NULL)
940                 return 0;
941
942         arg.w.stop = arg.w.skip = arg.w.count = 0;
943         arg.w.fn = check_loop_fn;
944         arg.depth = depth;
945         arg.p = p;
946         q->ops->cl_ops->walk(q, &arg.w);
947         return arg.w.stop ? -ELOOP : 0;
948 }
949
950 static int
951 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
952 {
953         struct Qdisc *leaf;
954         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
955         struct check_loop_arg *arg = (struct check_loop_arg *)w;
956
957         leaf = cops->leaf(q, cl);
958         if (leaf) {
959                 if (leaf == arg->p || arg->depth > 7)
960                         return -ELOOP;
961                 return check_loop(leaf, arg->p, arg->depth + 1);
962         }
963         return 0;
964 }
965
966 /*
967  * Delete/get qdisc.
968  */
969
970 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
971 {
972         struct net *net = sock_net(skb->sk);
973         struct tcmsg *tcm = NLMSG_DATA(n);
974         struct nlattr *tca[TCA_MAX + 1];
975         struct net_device *dev;
976         u32 clid = tcm->tcm_parent;
977         struct Qdisc *q = NULL;
978         struct Qdisc *p = NULL;
979         int err;
980
981         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
982         if (!dev)
983                 return -ENODEV;
984
985         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
986         if (err < 0)
987                 return err;
988
989         if (clid) {
990                 if (clid != TC_H_ROOT) {
991                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
992                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
993                                 if (!p)
994                                         return -ENOENT;
995                                 q = qdisc_leaf(p, clid);
996                         } else if (dev_ingress_queue(dev)) {
997                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
998                         }
999                 } else {
1000                         q = dev->qdisc;
1001                 }
1002                 if (!q)
1003                         return -ENOENT;
1004
1005                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1006                         return -EINVAL;
1007         } else {
1008                 q = qdisc_lookup(dev, tcm->tcm_handle);
1009                 if (!q)
1010                         return -ENOENT;
1011         }
1012
1013         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1014                 return -EINVAL;
1015
1016         if (n->nlmsg_type == RTM_DELQDISC) {
1017                 if (!clid)
1018                         return -EINVAL;
1019                 if (q->handle == 0)
1020                         return -ENOENT;
1021                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1022                 if (err != 0)
1023                         return err;
1024         } else {
1025                 qdisc_notify(net, skb, n, clid, NULL, q);
1026         }
1027         return 0;
1028 }
1029
1030 /*
1031  * Create/change qdisc.
1032  */
1033
1034 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1035 {
1036         struct net *net = sock_net(skb->sk);
1037         struct tcmsg *tcm;
1038         struct nlattr *tca[TCA_MAX + 1];
1039         struct net_device *dev;
1040         u32 clid;
1041         struct Qdisc *q, *p;
1042         int err;
1043
1044 replay:
1045         /* Reinit, just in case something touches this. */
1046         tcm = NLMSG_DATA(n);
1047         clid = tcm->tcm_parent;
1048         q = p = NULL;
1049
1050         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1051         if (!dev)
1052                 return -ENODEV;
1053
1054         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1055         if (err < 0)
1056                 return err;
1057
1058         if (clid) {
1059                 if (clid != TC_H_ROOT) {
1060                         if (clid != TC_H_INGRESS) {
1061                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1062                                 if (!p)
1063                                         return -ENOENT;
1064                                 q = qdisc_leaf(p, clid);
1065                         } else if (dev_ingress_queue_create(dev)) {
1066                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1067                         }
1068                 } else {
1069                         q = dev->qdisc;
1070                 }
1071
1072                 /* It may be default qdisc, ignore it */
1073                 if (q && q->handle == 0)
1074                         q = NULL;
1075
1076                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1077                         if (tcm->tcm_handle) {
1078                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1079                                         return -EEXIST;
1080                                 if (TC_H_MIN(tcm->tcm_handle))
1081                                         return -EINVAL;
1082                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1083                                 if (!q)
1084                                         goto create_n_graft;
1085                                 if (n->nlmsg_flags & NLM_F_EXCL)
1086                                         return -EEXIST;
1087                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1088                                         return -EINVAL;
1089                                 if (q == p ||
1090                                     (p && check_loop(q, p, 0)))
1091                                         return -ELOOP;
1092                                 atomic_inc(&q->refcnt);
1093                                 goto graft;
1094                         } else {
1095                                 if (!q)
1096                                         goto create_n_graft;
1097
1098                                 /* This magic test requires explanation.
1099                                  *
1100                                  *   We know that some child q is already
1101                                  *   attached to this parent and we have a choice:
1102                                  *   either to change it or to create/graft a new one.
1103                                  *
1104                                  *   1. We are allowed to create/graft only
1105                                  *   if CREATE and REPLACE flags are set.
1106                                  *
1107                                  *   2. If EXCL is set, the requestor meant that
1108                                  *   qdisc tcm_handle is not expected to exist,
1109                                  *   so we choose create/graft too.
1110                                  *
1111                                  *   3. The last case is when no flags are set.
1112                                  *   Alas, it is a sort of hole in the API; we
1113                                  *   cannot decide what to do unambiguously.
1114                                  *   For now we select create/graft if the user
1115                                  *   gave a KIND which does not match the existing one.
1116                                  */
1117                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1118                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1119                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1120                                      (tca[TCA_KIND] &&
1121                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1122                                         goto create_n_graft;
1123                         }
1124                 }
1125         } else {
1126                 if (!tcm->tcm_handle)
1127                         return -EINVAL;
1128                 q = qdisc_lookup(dev, tcm->tcm_handle);
1129         }
1130
1131         /* Change qdisc parameters */
1132         if (q == NULL)
1133                 return -ENOENT;
1134         if (n->nlmsg_flags & NLM_F_EXCL)
1135                 return -EEXIST;
1136         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1137                 return -EINVAL;
1138         err = qdisc_change(q, tca);
1139         if (err == 0)
1140                 qdisc_notify(net, skb, n, clid, NULL, q);
1141         return err;
1142
1143 create_n_graft:
1144         if (!(n->nlmsg_flags & NLM_F_CREATE))
1145                 return -ENOENT;
1146         if (clid == TC_H_INGRESS) {
1147                 if (dev_ingress_queue(dev))
1148                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1149                                          tcm->tcm_parent, tcm->tcm_parent,
1150                                          tca, &err);
1151                 else
1152                         err = -ENOENT;
1153         } else {
1154                 struct netdev_queue *dev_queue;
1155
1156                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1157                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1158                 else if (p)
1159                         dev_queue = p->dev_queue;
1160                 else
1161                         dev_queue = netdev_get_tx_queue(dev, 0);
1162
1163                 q = qdisc_create(dev, dev_queue, p,
1164                                  tcm->tcm_parent, tcm->tcm_handle,
1165                                  tca, &err);
1166         }
1167         if (q == NULL) {
1168                 if (err == -EAGAIN)
1169                         goto replay;
1170                 return err;
1171         }
1172
1173 graft:
1174         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1175         if (err) {
1176                 if (q)
1177                         qdisc_destroy(q);
1178                 return err;
1179         }
1180
1181         return 0;
1182 }
1183
1184 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1185                          u32 pid, u32 seq, u16 flags, int event)
1186 {
1187         struct tcmsg *tcm;
1188         struct nlmsghdr  *nlh;
1189         unsigned char *b = skb_tail_pointer(skb);
1190         struct gnet_dump d;
1191         struct qdisc_size_table *stab;
1192
1193         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1194         tcm = NLMSG_DATA(nlh);
1195         tcm->tcm_family = AF_UNSPEC;
1196         tcm->tcm__pad1 = 0;
1197         tcm->tcm__pad2 = 0;
1198         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1199         tcm->tcm_parent = clid;
1200         tcm->tcm_handle = q->handle;
1201         tcm->tcm_info = atomic_read(&q->refcnt);
1202         NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1203         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1204                 goto nla_put_failure;
1205         q->qstats.qlen = q->q.qlen;
1206
1207         stab = rtnl_dereference(q->stab);
1208         if (stab && qdisc_dump_stab(skb, stab) < 0)
1209                 goto nla_put_failure;
1210
1211         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1212                                          qdisc_root_sleeping_lock(q), &d) < 0)
1213                 goto nla_put_failure;
1214
1215         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1216                 goto nla_put_failure;
1217
1218         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1219             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1220             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1221                 goto nla_put_failure;
1222
1223         if (gnet_stats_finish_copy(&d) < 0)
1224                 goto nla_put_failure;
1225
1226         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1227         return skb->len;
1228
1229 nlmsg_failure:
1230 nla_put_failure:
1231         nlmsg_trim(skb, b);
1232         return -1;
1233 }
1234
1235 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1236 {
1237         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1238 }
1239
1240 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1241                         struct nlmsghdr *n, u32 clid,
1242                         struct Qdisc *old, struct Qdisc *new)
1243 {
1244         struct sk_buff *skb;
1245         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1246
1247         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1248         if (!skb)
1249                 return -ENOBUFS;
1250
1251         if (old && !tc_qdisc_dump_ignore(old)) {
1252                 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
1253                                   0, RTM_DELQDISC) < 0)
1254                         goto err_out;
1255         }
1256         if (new && !tc_qdisc_dump_ignore(new)) {
1257                 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
1258                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1259                         goto err_out;
1260         }
1261
1262         if (skb->len)
1263                 return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1264                                       n->nlmsg_flags & NLM_F_ECHO);
1265
1266 err_out:
1267         kfree_skb(skb);
1268         return -EINVAL;
1269 }
1270
1271 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1272                               struct netlink_callback *cb,
1273                               int *q_idx_p, int s_q_idx)
1274 {
1275         int ret = 0, q_idx = *q_idx_p;
1276         struct Qdisc *q;
1277
1278         if (!root)
1279                 return 0;
1280
1281         q = root;
1282         if (q_idx < s_q_idx) {
1283                 q_idx++;
1284         } else {
1285                 if (!tc_qdisc_dump_ignore(q) &&
1286                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1287                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1288                         goto done;
1289                 q_idx++;
1290         }
1291         list_for_each_entry(q, &root->list, list) {
1292                 if (q_idx < s_q_idx) {
1293                         q_idx++;
1294                         continue;
1295                 }
1296                 if (!tc_qdisc_dump_ignore(q) &&
1297                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1298                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1299                         goto done;
1300                 q_idx++;
1301         }
1302
1303 out:
1304         *q_idx_p = q_idx;
1305         return ret;
1306 done:
1307         ret = -1;
1308         goto out;
1309 }
1310
1311 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1312 {
1313         struct net *net = sock_net(skb->sk);
1314         int idx, q_idx;
1315         int s_idx, s_q_idx;
1316         struct net_device *dev;
1317
1318         s_idx = cb->args[0];
1319         s_q_idx = q_idx = cb->args[1];
1320
1321         rcu_read_lock();
1322         idx = 0;
1323         for_each_netdev_rcu(net, dev) {
1324                 struct netdev_queue *dev_queue;
1325
1326                 if (idx < s_idx)
1327                         goto cont;
1328                 if (idx > s_idx)
1329                         s_q_idx = 0;
1330                 q_idx = 0;
1331
1332                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1333                         goto done;
1334
1335                 dev_queue = dev_ingress_queue(dev);
1336                 if (dev_queue &&
1337                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1338                                        &q_idx, s_q_idx) < 0)
1339                         goto done;
1340
1341 cont:
1342                 idx++;
1343         }
1344
1345 done:
1346         rcu_read_unlock();
1347
1348         cb->args[0] = idx;
1349         cb->args[1] = q_idx;
1350
1351         return skb->len;
1352 }
1353
1354
1355
1356 /************************************************
1357  *      Traffic classes manipulation.           *
1358  ************************************************/
1359
1360
1361
1362 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1363 {
1364         struct net *net = sock_net(skb->sk);
1365         struct tcmsg *tcm = NLMSG_DATA(n);
1366         struct nlattr *tca[TCA_MAX + 1];
1367         struct net_device *dev;
1368         struct Qdisc *q = NULL;
1369         const struct Qdisc_class_ops *cops;
1370         unsigned long cl = 0;
1371         unsigned long new_cl;
1372         u32 pid = tcm->tcm_parent;
1373         u32 clid = tcm->tcm_handle;
1374         u32 qid = TC_H_MAJ(clid);
1375         int err;
1376
1377         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1378         if (!dev)
1379                 return -ENODEV;
1380
1381         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1382         if (err < 0)
1383                 return err;
1384
1385         /*
1386            parent == TC_H_UNSPEC - unspecified parent.
1387            parent == TC_H_ROOT   - class is root, which has no parent.
1388            parent == X:0         - parent is root class.
1389            parent == X:Y         - parent is a node in hierarchy.
1390            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1391
1392            handle == 0:0         - generate handle from kernel pool.
1393            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1394            handle == X:Y         - clear.
1395            handle == X:0         - root class.
1396          */
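        /*
         * Worked example (illustrative only): "tc class add dev eth0
         * parent 1:1 classid 1:10 ..." arrives with tcm_parent == 0x00010001
         * and tcm_handle == 0x0001000a.  Both majors are 0x0001, so qid
         * resolves to the handle of qdisc 1: and class 1:10 is created
         * under class 1:1 of that qdisc.
         */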
1397
1398         /* Step 1. Determine qdisc handle X:0 */
1399
1400         if (pid != TC_H_ROOT) {
1401                 u32 qid1 = TC_H_MAJ(pid);
1402
1403                 if (qid && qid1) {
1404                         /* If both majors are known, they must be identical. */
1405                         if (qid != qid1)
1406                                 return -EINVAL;
1407                 } else if (qid1) {
1408                         qid = qid1;
1409                 } else if (qid == 0)
1410                         qid = dev->qdisc->handle;
1411
1412                 /* Now qid is genuine qdisc handle consistent
1413                  * both with parent and child.
1414                  *
1415                  * TC_H_MAJ(pid) still may be unspecified, complete it now.
1416                  */
1417                 if (pid)
1418                         pid = TC_H_MAKE(qid, pid);
1419         } else {
1420                 if (qid == 0)
1421                         qid = dev->qdisc->handle;
1422         }
1423
1424         /* OK. Locate qdisc */
1425         q = qdisc_lookup(dev, qid);
1426         if (!q)
1427                 return -ENOENT;
1428
1429         /* And check that it supports classes */
1430         cops = q->ops->cl_ops;
1431         if (cops == NULL)
1432                 return -EINVAL;
1433
1434         /* Now try to get class */
1435         if (clid == 0) {
1436                 if (pid == TC_H_ROOT)
1437                         clid = qid;
1438         } else
1439                 clid = TC_H_MAKE(qid, clid);
1440
1441         if (clid)
1442                 cl = cops->get(q, clid);
1443
1444         if (cl == 0) {
1445                 err = -ENOENT;
1446                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1447                     !(n->nlmsg_flags & NLM_F_CREATE))
1448                         goto out;
1449         } else {
1450                 switch (n->nlmsg_type) {
1451                 case RTM_NEWTCLASS:
1452                         err = -EEXIST;
1453                         if (n->nlmsg_flags & NLM_F_EXCL)
1454                                 goto out;
1455                         break;
1456                 case RTM_DELTCLASS:
1457                         err = -EOPNOTSUPP;
1458                         if (cops->delete)
1459                                 err = cops->delete(q, cl);
1460                         if (err == 0)
1461                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1462                         goto out;
1463                 case RTM_GETTCLASS:
1464                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1465                         goto out;
1466                 default:
1467                         err = -EINVAL;
1468                         goto out;
1469                 }
1470         }
1471
1472         new_cl = cl;
1473         err = -EOPNOTSUPP;
1474         if (cops->change)
1475                 err = cops->change(q, clid, pid, tca, &new_cl);
1476         if (err == 0)
1477                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1478
1479 out:
1480         if (cl)
1481                 cops->put(q, cl);
1482
1483         return err;
1484 }
1485
1486
1487 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1488                           unsigned long cl,
1489                           u32 pid, u32 seq, u16 flags, int event)
1490 {
1491         struct tcmsg *tcm;
1492         struct nlmsghdr  *nlh;
1493         unsigned char *b = skb_tail_pointer(skb);
1494         struct gnet_dump d;
1495         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1496
1497         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1498         tcm = NLMSG_DATA(nlh);
1499         tcm->tcm_family = AF_UNSPEC;
1500         tcm->tcm__pad1 = 0;
1501         tcm->tcm__pad2 = 0;
1502         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1503         tcm->tcm_parent = q->handle;
1504         tcm->tcm_handle = q->handle;
1505         tcm->tcm_info = 0;
1506         NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1507         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1508                 goto nla_put_failure;
1509
1510         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1511                                          qdisc_root_sleeping_lock(q), &d) < 0)
1512                 goto nla_put_failure;
1513
1514         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1515                 goto nla_put_failure;
1516
1517         if (gnet_stats_finish_copy(&d) < 0)
1518                 goto nla_put_failure;
1519
1520         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1521         return skb->len;
1522
1523 nlmsg_failure:
1524 nla_put_failure:
1525         nlmsg_trim(skb, b);
1526         return -1;
1527 }
1528
1529 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1530                          struct nlmsghdr *n, struct Qdisc *q,
1531                          unsigned long cl, int event)
1532 {
1533         struct sk_buff *skb;
1534         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1535
1536         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1537         if (!skb)
1538                 return -ENOBUFS;
1539
1540         if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1541                 kfree_skb(skb);
1542                 return -EINVAL;
1543         }
1544
1545         return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1546                               n->nlmsg_flags & NLM_F_ECHO);
1547 }
1548
1549 struct qdisc_dump_args {
1550         struct qdisc_walker     w;
1551         struct sk_buff          *skb;
1552         struct netlink_callback *cb;
1553 };
1554
1555 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1556 {
1557         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1558
1559         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1560                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1561 }
1562
1563 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1564                                 struct tcmsg *tcm, struct netlink_callback *cb,
1565                                 int *t_p, int s_t)
1566 {
1567         struct qdisc_dump_args arg;
1568
1569         if (tc_qdisc_dump_ignore(q) ||
1570             *t_p < s_t || !q->ops->cl_ops ||
1571             (tcm->tcm_parent &&
1572              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1573                 (*t_p)++;
1574                 return 0;
1575         }
1576         if (*t_p > s_t)
1577                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1578         arg.w.fn = qdisc_class_dump;
1579         arg.skb = skb;
1580         arg.cb = cb;
1581         arg.w.stop  = 0;
1582         arg.w.skip = cb->args[1];
1583         arg.w.count = 0;
1584         q->ops->cl_ops->walk(q, &arg.w);
1585         cb->args[1] = arg.w.count;
1586         if (arg.w.stop)
1587                 return -1;
1588         (*t_p)++;
1589         return 0;
1590 }
1591
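/*
 * Dump the classes of @root and of every other qdisc linked on its ->list.
 */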
1592 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1593                                struct tcmsg *tcm, struct netlink_callback *cb,
1594                                int *t_p, int s_t)
1595 {
1596         struct Qdisc *q;
1597
1598         if (!root)
1599                 return 0;
1600
1601         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1602                 return -1;
1603
1604         list_for_each_entry(q, &root->list, list) {
1605                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1606                         return -1;
1607         }
1608
1609         return 0;
1610 }
1611
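/*
 * RTM_GETTCLASS dump handler: walk the classes of the device's root qdisc
 * and, if present, of its ingress qdisc.  cb->args[0] counts the qdiscs
 * already completed so a multi-part dump can be resumed.
 */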
1612 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1613 {
1614         struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
1615         struct net *net = sock_net(skb->sk);
1616         struct netdev_queue *dev_queue;
1617         struct net_device *dev;
1618         int t, s_t;
1619
1620         if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1621                 return 0;
1622         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1623         if (!dev)
1624                 return 0;
1625
1626         s_t = cb->args[0];
1627         t = 0;
1628
1629         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1630                 goto done;
1631
1632         dev_queue = dev_ingress_queue(dev);
1633         if (dev_queue &&
1634             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1635                                 &t, s_t) < 0)
1636                 goto done;
1637
1638 done:
1639         cb->args[0] = t;
1640
1641         dev_put(dev);
1642         return skb->len;
1643 }
1644
1645 /* Main classifier routine: scans the classifier chain attached
1646  * to this qdisc, skips entries whose protocol does not match and
1647  * asks the matching classifiers for a verdict.
1648  */
1649 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1650                        struct tcf_result *res)
1651 {
1652         __be16 protocol = skb->protocol;
1653         int err;
1654
1655         for (; tp; tp = tp->next) {
1656                 if (tp->protocol != protocol &&
1657                     tp->protocol != htons(ETH_P_ALL))
1658                         continue;
1659                 err = tp->classify(skb, tp, res);
1660
1661                 if (err >= 0) {
1662 #ifdef CONFIG_NET_CLS_ACT
1663                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1664                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1665 #endif
1666                         return err;
1667                 }
1668         }
1669         return -1;
1670 }
1671 EXPORT_SYMBOL(tc_classify_compat);
1672
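/*
 * Like tc_classify_compat(), but with CONFIG_NET_CLS_ACT a
 * TC_ACT_RECLASSIFY verdict restarts classification from the head of the
 * chain, bounded by MAX_REC_LOOP to break potential reclassification loops.
 */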
1673 int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
1674                 struct tcf_result *res)
1675 {
1676         int err = 0;
1677 #ifdef CONFIG_NET_CLS_ACT
1678         const struct tcf_proto *otp = tp;
1679 reclassify:
1680 #endif
1681
1682         err = tc_classify_compat(skb, tp, res);
1683 #ifdef CONFIG_NET_CLS_ACT
1684         if (err == TC_ACT_RECLASSIFY) {
1685                 u32 verd = G_TC_VERD(skb->tc_verd);
1686                 tp = otp;
1687
1688                 if (verd++ >= MAX_REC_LOOP) {
1689                         if (net_ratelimit())
1690                                 pr_notice("%s: packet reclassify loop"
1691                                           " rule prio %u protocol %02x\n",
1692                                           tp->q->ops->id,
1693                                           tp->prio & 0xffff,
1694                                           ntohs(tp->protocol));
1695                         return TC_ACT_SHOT;
1696                 }
1697                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1698                 goto reclassify;
1699         }
1700 #endif
1701         return err;
1702 }
1703 EXPORT_SYMBOL(tc_classify);
1704
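/* Destroy a single classifier and drop the reference on its module. */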
1705 void tcf_destroy(struct tcf_proto *tp)
1706 {
1707         tp->ops->destroy(tp);
1708         module_put(tp->ops->owner);
1709         kfree(tp);
1710 }
1711
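/* Unlink and destroy every classifier on the chain headed by *fl. */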
1712 void tcf_destroy_chain(struct tcf_proto **fl)
1713 {
1714         struct tcf_proto *tp;
1715
1716         while ((tp = *fl) != NULL) {
1717                 *fl = tp->next;
1718                 tcf_destroy(tp);
1719         }
1720 }
1721 EXPORT_SYMBOL(tcf_destroy_chain);
1722
1723 #ifdef CONFIG_PROC_FS
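/*
 * /proc/net/psched: four hex words derived from NSEC_PER_USEC,
 * PSCHED_TICKS2NS(1), a constant 1000000 and the hrtimer resolution;
 * user-space tools such as tc parse them to convert between their time
 * units and psched ticks.
 */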
1724 static int psched_show(struct seq_file *seq, void *v)
1725 {
1726         struct timespec ts;
1727
1728         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1729         seq_printf(seq, "%08x %08x %08x %08x\n",
1730                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1731                    1000000,
1732                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1733
1734         return 0;
1735 }
1736
1737 static int psched_open(struct inode *inode, struct file *file)
1738 {
1739         return single_open(file, psched_show, NULL);
1740 }
1741
1742 static const struct file_operations psched_fops = {
1743         .owner = THIS_MODULE,
1744         .open = psched_open,
1745         .read  = seq_read,
1746         .llseek = seq_lseek,
1747         .release = single_release,
1748 };
1749
1750 static int __net_init psched_net_init(struct net *net)
1751 {
1752         struct proc_dir_entry *e;
1753
1754         e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1755         if (e == NULL)
1756                 return -ENOMEM;
1757
1758         return 0;
1759 }
1760
1761 static void __net_exit psched_net_exit(struct net *net)
1762 {
1763         proc_net_remove(net, "psched");
1764 }
1765 #else
1766 static int __net_init psched_net_init(struct net *net)
1767 {
1768         return 0;
1769 }
1770
1771 static void __net_exit psched_net_exit(struct net *net)
1772 {
1773 }
1774 #endif
1775
1776 static struct pernet_operations psched_net_ops = {
1777         .init = psched_net_init,
1778         .exit = psched_net_exit,
1779 };
1780
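/*
 * Subsystem init: register the per-namespace /proc/net/psched entry, the
 * built-in fifo/mq qdiscs and the rtnetlink handlers for qdisc and class
 * messages.
 */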
1781 static int __init pktsched_init(void)
1782 {
1783         int err;
1784
1785         err = register_pernet_subsys(&psched_net_ops);
1786         if (err) {
1787                 pr_err("pktsched_init: "
1788                        "cannot initialize per netns operations\n");
1789                 return err;
1790         }
1791
1792         register_qdisc(&pfifo_qdisc_ops);
1793         register_qdisc(&bfifo_qdisc_ops);
1794         register_qdisc(&pfifo_head_drop_qdisc_ops);
1795         register_qdisc(&mq_qdisc_ops);
1796
1797         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1798         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1799         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1800         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1801         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1802         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1803
1804         return 0;
1805 }
1806
1807 subsys_initcall(pktsched_init);