tcp: make challenge acks less predictable
[pandora-kernel.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
/*
 * Forward declarations of the netlink notification helpers. They are needed
 * because code earlier in this file (e.g. notify_and_destroy(), which calls
 * qdisc_notify()) uses them ahead of their definitions.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
                        struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
                         struct nlmsghdr *n, struct Qdisc *q,
                         unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by the user in the form of handles
70    into a form more intelligible to the kernel, to perform some
71    sanity checks and the part of the work that is common to all
72    qdiscs, and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0, if packet was enqueued successfully.
92    If packet (this one or another one) was dropped, it returns
93    not zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
/*
 * Protects the list of registered TC modules (qdisc_base). It is pure SMP
 * lock: taken for writing by register_qdisc()/unregister_qdisc() and for
 * reading by qdisc_lookup_ops().
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
/*
 * The list of all installed queueing disciplines: a singly linked list of
 * Qdisc_ops chained through ->next, NULL-terminated, guarded by
 * qdisc_mod_lock.
 */

static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* We know handle. Find qdisc among all qdisc's attached to device
204    (root qdisc, all its children, children of children etc.)
205  */
206
207 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
208 {
209         struct Qdisc *q;
210
211         if (!(root->flags & TCQ_F_BUILTIN) &&
212             root->handle == handle)
213                 return root;
214
215         list_for_each_entry(q, &root->list, list) {
216                 if (q->handle == handle)
217                         return q;
218         }
219         return NULL;
220 }
221
222 static void qdisc_list_add(struct Qdisc *q)
223 {
224         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227
228 void qdisc_list_del(struct Qdisc *q)
229 {
230         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231                 list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234
235 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
236 {
237         struct Qdisc *q;
238
239         q = qdisc_match_from_root(dev->qdisc, handle);
240         if (q)
241                 goto out;
242
243         if (dev_ingress_queue(dev))
244                 q = qdisc_match_from_root(
245                         dev_ingress_queue(dev)->qdisc_sleeping,
246                         handle);
247 out:
248         return q;
249 }
250
251 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252 {
253         unsigned long cl;
254         struct Qdisc *leaf;
255         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
256
257         if (cops == NULL)
258                 return NULL;
259         cl = cops->get(p, classid);
260
261         if (cl == 0)
262                 return NULL;
263         leaf = cops->leaf(p, cl);
264         cops->put(p, cl);
265         return leaf;
266 }
267
268 /* Find queueing discipline by name */
269
270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272         struct Qdisc_ops *q = NULL;
273
274         if (kind) {
275                 read_lock(&qdisc_mod_lock);
276                 for (q = qdisc_base; q; q = q->next) {
277                         if (nla_strcmp(kind, q->id) == 0) {
278                                 if (!try_module_get(q->owner))
279                                         q = NULL;
280                                 break;
281                         }
282                 }
283                 read_unlock(&qdisc_mod_lock);
284         }
285         return q;
286 }
287
288 static struct qdisc_rate_table *qdisc_rtab_list;
289
290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292         struct qdisc_rate_table *rtab;
293
294         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
295                 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
296                         rtab->refcnt++;
297                         return rtab;
298                 }
299         }
300
301         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
302             nla_len(tab) != TC_RTAB_SIZE)
303                 return NULL;
304
305         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
306         if (rtab) {
307                 rtab->rate = *r;
308                 rtab->refcnt = 1;
309                 memcpy(rtab->data, nla_data(tab), 1024);
310                 rtab->next = qdisc_rtab_list;
311                 qdisc_rtab_list = rtab;
312         }
313         return rtab;
314 }
315 EXPORT_SYMBOL(qdisc_get_rtab);
316
317 void qdisc_put_rtab(struct qdisc_rate_table *tab)
318 {
319         struct qdisc_rate_table *rtab, **rtabp;
320
321         if (!tab || --tab->refcnt)
322                 return;
323
324         for (rtabp = &qdisc_rtab_list;
325              (rtab = *rtabp) != NULL;
326              rtabp = &rtab->next) {
327                 if (rtab == tab) {
328                         *rtabp = rtab->next;
329                         kfree(rtab);
330                         return;
331                 }
332         }
333 }
334 EXPORT_SYMBOL(qdisc_put_rtab);
335
/* All size tables currently in use; entries are shared through ->refcnt. */
static LIST_HEAD(qdisc_stab_list);
/* Guards qdisc_stab_list and the reference counts of the tables on it. */
static DEFINE_SPINLOCK(qdisc_stab_lock);

/*
 * Netlink policy used by qdisc_get_stab() when parsing the attributes nested
 * inside TCA_STAB.
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};
343
344 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
345 {
346         struct nlattr *tb[TCA_STAB_MAX + 1];
347         struct qdisc_size_table *stab;
348         struct tc_sizespec *s;
349         unsigned int tsize = 0;
350         u16 *tab = NULL;
351         int err;
352
353         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
354         if (err < 0)
355                 return ERR_PTR(err);
356         if (!tb[TCA_STAB_BASE])
357                 return ERR_PTR(-EINVAL);
358
359         s = nla_data(tb[TCA_STAB_BASE]);
360
361         if (s->tsize > 0) {
362                 if (!tb[TCA_STAB_DATA])
363                         return ERR_PTR(-EINVAL);
364                 tab = nla_data(tb[TCA_STAB_DATA]);
365                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
366         }
367
368         if (tsize != s->tsize || (!tab && tsize > 0))
369                 return ERR_PTR(-EINVAL);
370
371         spin_lock(&qdisc_stab_lock);
372
373         list_for_each_entry(stab, &qdisc_stab_list, list) {
374                 if (memcmp(&stab->szopts, s, sizeof(*s)))
375                         continue;
376                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
377                         continue;
378                 stab->refcnt++;
379                 spin_unlock(&qdisc_stab_lock);
380                 return stab;
381         }
382
383         spin_unlock(&qdisc_stab_lock);
384
385         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
386         if (!stab)
387                 return ERR_PTR(-ENOMEM);
388
389         stab->refcnt = 1;
390         stab->szopts = *s;
391         if (tsize > 0)
392                 memcpy(stab->data, tab, tsize * sizeof(u16));
393
394         spin_lock(&qdisc_stab_lock);
395         list_add_tail(&stab->list, &qdisc_stab_list);
396         spin_unlock(&qdisc_stab_lock);
397
398         return stab;
399 }
400
401 static void stab_kfree_rcu(struct rcu_head *head)
402 {
403         kfree(container_of(head, struct qdisc_size_table, rcu));
404 }
405
406 void qdisc_put_stab(struct qdisc_size_table *tab)
407 {
408         if (!tab)
409                 return;
410
411         spin_lock(&qdisc_stab_lock);
412
413         if (--tab->refcnt == 0) {
414                 list_del(&tab->list);
415                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
416         }
417
418         spin_unlock(&qdisc_stab_lock);
419 }
420 EXPORT_SYMBOL(qdisc_put_stab);
421
422 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
423 {
424         struct nlattr *nest;
425
426         nest = nla_nest_start(skb, TCA_STAB);
427         if (nest == NULL)
428                 goto nla_put_failure;
429         NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
430         nla_nest_end(skb, nest);
431
432         return skb->len;
433
434 nla_put_failure:
435         return -1;
436 }
437
438 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
439 {
440         int pkt_len, slot;
441
442         pkt_len = skb->len + stab->szopts.overhead;
443         if (unlikely(!stab->szopts.tsize))
444                 goto out;
445
446         slot = pkt_len + stab->szopts.cell_align;
447         if (unlikely(slot < 0))
448                 slot = 0;
449
450         slot >>= stab->szopts.cell_log;
451         if (likely(slot < stab->szopts.tsize))
452                 pkt_len = stab->data[slot];
453         else
454                 pkt_len = stab->data[stab->szopts.tsize - 1] *
455                                 (slot / stab->szopts.tsize) +
456                                 stab->data[slot % stab->szopts.tsize];
457
458         pkt_len <<= stab->szopts.size_log;
459 out:
460         if (unlikely(pkt_len < 1))
461                 pkt_len = 1;
462         qdisc_skb_cb(skb)->pkt_len = pkt_len;
463 }
464 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
465
466 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
467 {
468         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
469                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
470                         txt, qdisc->ops->id, qdisc->handle >> 16);
471                 qdisc->flags |= TCQ_F_WARN_NONWC;
472         }
473 }
474 EXPORT_SYMBOL(qdisc_warn_nonwc);
475
476 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
477 {
478         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
479                                                  timer);
480
481         qdisc_unthrottled(wd->qdisc);
482         __netif_schedule(qdisc_root(wd->qdisc));
483
484         return HRTIMER_NORESTART;
485 }
486
487 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
488 {
489         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
490         wd->timer.function = qdisc_watchdog;
491         wd->qdisc = qdisc;
492 }
493 EXPORT_SYMBOL(qdisc_watchdog_init);
494
495 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
496 {
497         ktime_t time;
498
499         if (test_bit(__QDISC_STATE_DEACTIVATED,
500                      &qdisc_root_sleeping(wd->qdisc)->state))
501                 return;
502
503         qdisc_throttled(wd->qdisc);
504         time = ktime_set(0, 0);
505         time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
506         hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
507 }
508 EXPORT_SYMBOL(qdisc_watchdog_schedule);
509
510 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
511 {
512         hrtimer_cancel(&wd->timer);
513         qdisc_unthrottled(wd->qdisc);
514 }
515 EXPORT_SYMBOL(qdisc_watchdog_cancel);
516
517 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
518 {
519         unsigned int size = n * sizeof(struct hlist_head), i;
520         struct hlist_head *h;
521
522         if (size <= PAGE_SIZE)
523                 h = kmalloc(size, GFP_KERNEL);
524         else
525                 h = (struct hlist_head *)
526                         __get_free_pages(GFP_KERNEL, get_order(size));
527
528         if (h != NULL) {
529                 for (i = 0; i < n; i++)
530                         INIT_HLIST_HEAD(&h[i]);
531         }
532         return h;
533 }
534
535 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
536 {
537         unsigned int size = n * sizeof(struct hlist_head);
538
539         if (size <= PAGE_SIZE)
540                 kfree(h);
541         else
542                 free_pages((unsigned long)h, get_order(size));
543 }
544
545 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
546 {
547         struct Qdisc_class_common *cl;
548         struct hlist_node *n, *next;
549         struct hlist_head *nhash, *ohash;
550         unsigned int nsize, nmask, osize;
551         unsigned int i, h;
552
553         /* Rehash when load factor exceeds 0.75 */
554         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
555                 return;
556         nsize = clhash->hashsize * 2;
557         nmask = nsize - 1;
558         nhash = qdisc_class_hash_alloc(nsize);
559         if (nhash == NULL)
560                 return;
561
562         ohash = clhash->hash;
563         osize = clhash->hashsize;
564
565         sch_tree_lock(sch);
566         for (i = 0; i < osize; i++) {
567                 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
568                         h = qdisc_class_hash(cl->classid, nmask);
569                         hlist_add_head(&cl->hnode, &nhash[h]);
570                 }
571         }
572         clhash->hash     = nhash;
573         clhash->hashsize = nsize;
574         clhash->hashmask = nmask;
575         sch_tree_unlock(sch);
576
577         qdisc_class_hash_free(ohash, osize);
578 }
579 EXPORT_SYMBOL(qdisc_class_hash_grow);
580
581 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
582 {
583         unsigned int size = 4;
584
585         clhash->hash = qdisc_class_hash_alloc(size);
586         if (clhash->hash == NULL)
587                 return -ENOMEM;
588         clhash->hashsize  = size;
589         clhash->hashmask  = size - 1;
590         clhash->hashelems = 0;
591         return 0;
592 }
593 EXPORT_SYMBOL(qdisc_class_hash_init);
594
595 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
596 {
597         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
598 }
599 EXPORT_SYMBOL(qdisc_class_hash_destroy);
600
601 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
602                              struct Qdisc_class_common *cl)
603 {
604         unsigned int h;
605
606         INIT_HLIST_NODE(&cl->hnode);
607         h = qdisc_class_hash(cl->classid, clhash->hashmask);
608         hlist_add_head(&cl->hnode, &clhash->hash[h]);
609         clhash->hashelems++;
610 }
611 EXPORT_SYMBOL(qdisc_class_hash_insert);
612
613 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
614                              struct Qdisc_class_common *cl)
615 {
616         hlist_del(&cl->hnode);
617         clhash->hashelems--;
618 }
619 EXPORT_SYMBOL(qdisc_class_hash_remove);
620
621 /* Allocate an unique handle from space managed by kernel */
622
623 static u32 qdisc_alloc_handle(struct net_device *dev)
624 {
625         int i = 0x10000;
626         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
627
628         do {
629                 autohandle += TC_H_MAKE(0x10000U, 0);
630                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
631                         autohandle = TC_H_MAKE(0x80000000U, 0);
632         } while (qdisc_lookup(dev, autohandle) && --i > 0);
633
634         return i > 0 ? autohandle : 0;
635 }
636
637 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
638 {
639         const struct Qdisc_class_ops *cops;
640         unsigned long cl;
641         u32 parentid;
642
643         if (n == 0)
644                 return;
645         while ((parentid = sch->parent)) {
646                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
647                         return;
648
649                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
650                 if (sch == NULL) {
651                         WARN_ON(parentid != TC_H_ROOT);
652                         return;
653                 }
654                 cops = sch->ops->cl_ops;
655                 if (cops->qlen_notify) {
656                         cl = cops->get(sch, parentid);
657                         cops->qlen_notify(sch, cl);
658                         cops->put(sch, cl);
659                 }
660                 sch->q.qlen -= n;
661         }
662 }
663 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
664
665 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
666                                struct nlmsghdr *n, u32 clid,
667                                struct Qdisc *old, struct Qdisc *new)
668 {
669         if (new || old)
670                 qdisc_notify(net, skb, n, clid, old, new);
671
672         if (old)
673                 qdisc_destroy(old);
674 }
675
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * Returns 0 on success. Device-level grafts (parent == NULL) fail with
 * -ENOENT when an ingress qdisc is involved but the device has no ingress
 * queue. Class-level grafts fail with -EOPNOTSUPP when the parent has no
 * graft class operation, -ENOENT when the class cannot be found, or with
 * whatever the parent's graft operation returns.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
                       struct Qdisc *new, struct Qdisc *old)
{
        struct Qdisc *q = old;
        struct net *net = dev_net(dev);
        int err = 0;

        if (parent == NULL) {
                /* No parent: graft directly onto the device queues. */
                unsigned int i, num_q, ingress;

                ingress = 0;
                num_q = dev->num_tx_queues;
                /* An ingress qdisc (old or new) only concerns the single ingress queue. */
                if ((q && q->flags & TCQ_F_INGRESS) ||
                    (new && new->flags & TCQ_F_INGRESS)) {
                        num_q = 1;
                        ingress = 1;
                        if (!dev_ingress_queue(dev))
                                return -ENOENT;
                }

                /* Deactivate a running device while its qdiscs are swapped. */
                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);

                /*
                 * A qdisc with its own attach operation does the attaching
                 * itself; num_q = 0 makes the per-queue loop below a no-op.
                 */
                if (new && new->ops->attach) {
                        new->ops->attach(new);
                        num_q = 0;
                }

                for (i = 0; i < num_q; i++) {
                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);

                        if (!ingress)
                                dev_queue = netdev_get_tx_queue(dev, i);

                        old = dev_graft_qdisc(dev_queue, new);
                        /* Every queue after the first takes an extra reference on new. */
                        if (new && i > 0)
                                atomic_inc(&new->refcnt);

                        if (!ingress)
                                qdisc_destroy(old);
                }

                if (!ingress) {
                        /* Egress: report and drop the previous dev->qdisc, then install new. */
                        notify_and_destroy(net, skb, n, classid,
                                           dev->qdisc, new);
                        /* Extra reference for dev->qdisc, unless attach() was used above. */
                        if (new && !new->ops->attach)
                                atomic_inc(&new->refcnt);
                        dev->qdisc = new ? : &noop_qdisc;
                } else {
                        /* Ingress: old is what dev_graft_qdisc() returned for the ingress queue. */
                        notify_and_destroy(net, skb, n, classid, old, new);
                }

                if (dev->flags & IFF_UP)
                        dev_activate(dev);
        } else {
                /* Graft below a class of "parent" through its class operations. */
                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

                err = -EOPNOTSUPP;
                if (cops && cops->graft) {
                        unsigned long cl = cops->get(parent, classid);
                        if (cl) {
                                /* graft() hands the replaced qdisc back through &old. */
                                err = cops->graft(parent, cl, new, &old);
                                cops->put(parent, cl);
                        } else
                                err = -ENOENT;
                }
                if (!err)
                        notify_and_destroy(net, skb, n, classid, old, new);
        }
        return err;
}
757
/*
 * lockdep annotation is needed for ingress; egress gets it only for name.
 * qdisc_create() assigns qdisc_rx_lock to the lock of ingress qdiscs and
 * qdisc_tx_lock to the lock of all other qdiscs.
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
761
/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.

   dev/dev_queue: device and queue the qdisc is created for.
   p:             parent qdisc (may be NULL); only consulted when choosing
                  the lock for the rate estimator.
   parent:        parent id stored in the new qdisc.
   handle:        requested handle; 0 asks for an automatically allocated
                  one, TC_H_INGRESS creates an ingress qdisc.
   tca:           parsed netlink attributes (TCA_KIND, TCA_OPTIONS,
                  TCA_STAB, TCA_RATE are used).
   errp:          receives the error code on failure.

   Returns the new qdisc, or NULL with *errp set. -EAGAIN in *errp means a
   module was loaded with the RTNL lock dropped and the caller must replay
   the request.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
             struct Qdisc *p, u32 parent, u32 handle,
             struct nlattr **tca, int *errp)
{
        int err;
        struct nlattr *kind = tca[TCA_KIND];
        struct Qdisc *sch;
        struct Qdisc_ops *ops;
        struct qdisc_size_table *stab;

        /* On success this holds a reference on the module owning ops. */
        ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
        if (ops == NULL && kind != NULL) {
                char name[IFNAMSIZ];
                /* Only try to load a module if the kind fits into name[] untruncated. */
                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
                        /* We dropped the RTNL semaphore in order to
                         * perform the module load.  So, even if we
                         * succeeded in loading the module we have to
                         * tell the caller to replay the request.  We
                         * indicate this using -EAGAIN.
                         * We replay the request because the device may
                         * go away in the mean time.
                         */
                        rtnl_unlock();
                        request_module("sch_%s", name);
                        rtnl_lock();
                        ops = qdisc_lookup_ops(kind);
                        if (ops != NULL) {
                                /* We will try again qdisc_lookup_ops,
                                 * so don't keep a reference.
                                 */
                                module_put(ops->owner);
                                err = -EAGAIN;
                                goto err_out;
                        }
                }
        }
#endif

        err = -ENOENT;
        if (ops == NULL)
                goto err_out;

        sch = qdisc_alloc(dev_queue, ops);
        if (IS_ERR(sch)) {
                err = PTR_ERR(sch);
                goto err_out2;
        }

        sch->parent = parent;

        if (handle == TC_H_INGRESS) {
                sch->flags |= TCQ_F_INGRESS;
                handle = TC_H_MAKE(TC_H_INGRESS, 0);
                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
        } else {
                if (handle == 0) {
                        /* No handle requested: pick a free one for this device. */
                        handle = qdisc_alloc_handle(dev);
                        err = -ENOMEM;
                        if (handle == 0)
                                goto err_out3;
                }
                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
        }

        sch->handle = handle;

        /* A qdisc without an init operation counts as successfully initialized. */
        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
                if (tca[TCA_STAB]) {
                        stab = qdisc_get_stab(tca[TCA_STAB]);
                        if (IS_ERR(stab)) {
                                err = PTR_ERR(stab);
                                goto err_out4;
                        }
                        rcu_assign_pointer(sch->stab, stab);
                }
                if (tca[TCA_RATE]) {
                        spinlock_t *root_lock;

                        /* Rate estimators are refused on TCQ_F_MQROOT qdiscs. */
                        err = -EOPNOTSUPP;
                        if (sch->flags & TCQ_F_MQROOT)
                                goto err_out4;

                        /*
                         * Use the sleeping root's lock for a non-root,
                         * non-ingress qdisc whose parent is not an MQROOT
                         * qdisc; otherwise use the qdisc's own lock.
                         */
                        if ((sch->parent != TC_H_ROOT) &&
                            !(sch->flags & TCQ_F_INGRESS) &&
                            (!p || !(p->flags & TCQ_F_MQROOT)))
                                root_lock = qdisc_root_sleeping_lock(sch);
                        else
                                root_lock = qdisc_lock(sch);

                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
                                                root_lock, tca[TCA_RATE]);
                        if (err)
                                goto err_out4;
                }

                qdisc_list_add(sch);

                return sch;
        }
        /* Reached directly when ops->init() fails: ops->destroy() is not called. */
err_out3:
        /* NOTE(review): presumably balances a device reference taken in qdisc_alloc() - confirm. */
        dev_put(dev);
        /* sch->padded is the offset of sch from the start of its allocation. */
        kfree((char *) sch - sch->padded);
err_out2:
        /* Drop the module reference obtained by qdisc_lookup_ops(). */
        module_put(ops->owner);
err_out:
        *errp = err;
        return NULL;

err_out4:
        /*
         * Any broken qdiscs that would require a ops->reset() here?
         * The qdisc was never in action so it shouldn't be necessary.
         */
        qdisc_put_stab(rtnl_dereference(sch->stab));
        if (ops->destroy)
                ops->destroy(sch);
        goto err_out3;
}
888
889 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
890 {
891         struct qdisc_size_table *ostab, *stab = NULL;
892         int err = 0;
893
894         if (tca[TCA_OPTIONS]) {
895                 if (sch->ops->change == NULL)
896                         return -EINVAL;
897                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
898                 if (err)
899                         return err;
900         }
901
902         if (tca[TCA_STAB]) {
903                 stab = qdisc_get_stab(tca[TCA_STAB]);
904                 if (IS_ERR(stab))
905                         return PTR_ERR(stab);
906         }
907
908         ostab = rtnl_dereference(sch->stab);
909         rcu_assign_pointer(sch->stab, stab);
910         qdisc_put_stab(ostab);
911
912         if (tca[TCA_RATE]) {
913                 /* NB: ignores errors from replace_estimator
914                    because change can't be undone. */
915                 if (sch->flags & TCQ_F_MQROOT)
916                         goto out;
917                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
918                                             qdisc_root_sleeping_lock(sch),
919                                             tca[TCA_RATE]);
920         }
921 out:
922         return 0;
923 }
924
/* Walker state used by check_loop()/check_loop_fn(). */
struct check_loop_arg {
        struct qdisc_walker     w;      /* first member: check_loop_fn() casts the walker pointer back to this struct */
        struct Qdisc            *p;     /* qdisc that must not be found among the leaves */
        int                     depth;  /* nesting level of the current walk */
};

/* Declared ahead of check_loop(), which installs it as the walker callback. */
static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
932
933 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
934 {
935         struct check_loop_arg   arg;
936
937         if (q->ops->cl_ops == NULL)
938                 return 0;
939
940         arg.w.stop = arg.w.skip = arg.w.count = 0;
941         arg.w.fn = check_loop_fn;
942         arg.depth = depth;
943         arg.p = p;
944         q->ops->cl_ops->walk(q, &arg.w);
945         return arg.w.stop ? -ELOOP : 0;
946 }
947
948 static int
949 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
950 {
951         struct Qdisc *leaf;
952         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
953         struct check_loop_arg *arg = (struct check_loop_arg *)w;
954
955         leaf = cops->leaf(q, cl);
956         if (leaf) {
957                 if (leaf == arg->p || arg->depth > 7)
958                         return -ELOOP;
959                 return check_loop(leaf, arg->p, arg->depth + 1);
960         }
961         return 0;
962 }
963
964 /*
965  * Delete/get qdisc.
966  */
967
968 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
969 {
970         struct net *net = sock_net(skb->sk);
971         struct tcmsg *tcm = NLMSG_DATA(n);
972         struct nlattr *tca[TCA_MAX + 1];
973         struct net_device *dev;
974         u32 clid = tcm->tcm_parent;
975         struct Qdisc *q = NULL;
976         struct Qdisc *p = NULL;
977         int err;
978
979         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
980         if (!dev)
981                 return -ENODEV;
982
983         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
984         if (err < 0)
985                 return err;
986
987         if (clid) {
988                 if (clid != TC_H_ROOT) {
989                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
990                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
991                                 if (!p)
992                                         return -ENOENT;
993                                 q = qdisc_leaf(p, clid);
994                         } else if (dev_ingress_queue(dev)) {
995                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
996                         }
997                 } else {
998                         q = dev->qdisc;
999                 }
1000                 if (!q)
1001                         return -ENOENT;
1002
1003                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1004                         return -EINVAL;
1005         } else {
1006                 q = qdisc_lookup(dev, tcm->tcm_handle);
1007                 if (!q)
1008                         return -ENOENT;
1009         }
1010
1011         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1012                 return -EINVAL;
1013
1014         if (n->nlmsg_type == RTM_DELQDISC) {
1015                 if (!clid)
1016                         return -EINVAL;
1017                 if (q->handle == 0)
1018                         return -ENOENT;
1019                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1020                 if (err != 0)
1021                         return err;
1022         } else {
1023                 qdisc_notify(net, skb, n, clid, NULL, q);
1024         }
1025         return 0;
1026 }
1027
1028 /*
1029  * Create/change qdisc.
1030  */
1031
1032 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1033 {
1034         struct net *net = sock_net(skb->sk);
1035         struct tcmsg *tcm;
1036         struct nlattr *tca[TCA_MAX + 1];
1037         struct net_device *dev;
1038         u32 clid;
1039         struct Qdisc *q, *p;
1040         int err;
1041
1042 replay:
1043         /* Reinit, just in case something touches this. */
1044         tcm = NLMSG_DATA(n);
1045         clid = tcm->tcm_parent;
1046         q = p = NULL;
1047
1048         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1049         if (!dev)
1050                 return -ENODEV;
1051
1052         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1053         if (err < 0)
1054                 return err;
1055
1056         if (clid) {
1057                 if (clid != TC_H_ROOT) {
1058                         if (clid != TC_H_INGRESS) {
1059                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1060                                 if (!p)
1061                                         return -ENOENT;
1062                                 q = qdisc_leaf(p, clid);
1063                         } else if (dev_ingress_queue_create(dev)) {
1064                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1065                         }
1066                 } else {
1067                         q = dev->qdisc;
1068                 }
1069
1070                 /* It may be default qdisc, ignore it */
1071                 if (q && q->handle == 0)
1072                         q = NULL;
1073
1074                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1075                         if (tcm->tcm_handle) {
1076                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1077                                         return -EEXIST;
1078                                 if (TC_H_MIN(tcm->tcm_handle))
1079                                         return -EINVAL;
1080                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1081                                 if (!q)
1082                                         goto create_n_graft;
1083                                 if (n->nlmsg_flags & NLM_F_EXCL)
1084                                         return -EEXIST;
1085                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1086                                         return -EINVAL;
1087                                 if (q == p ||
1088                                     (p && check_loop(q, p, 0)))
1089                                         return -ELOOP;
1090                                 atomic_inc(&q->refcnt);
1091                                 goto graft;
1092                         } else {
1093                                 if (!q)
1094                                         goto create_n_graft;
1095
1096                                 /* This magic test requires explanation.
1097                                  *
1098                                  *   We know, that some child q is already
1099                                  *   attached to this parent and have choice:
1100                                  *   either to change it or to create/graft new one.
1101                                  *
1102                                  *   1. We are allowed to create/graft only
1103                                  *   if CREATE and REPLACE flags are set.
1104                                  *
1105                                  *   2. If EXCL is set, requestor wanted to say,
1106                                  *   that qdisc tcm_handle is not expected
1107                                  *   to exist, so that we choose create/graft too.
1108                                  *
1109                                  *   3. The last case is when no flags are set.
1110                                  *   Alas, it is sort of hole in API, we
1111                                  *   cannot decide what to do unambiguously.
1112                                  *   For now we select create/graft, if
1113                                  *   user gave KIND, which does not match existing.
1114                                  */
1115                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1116                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1117                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1118                                      (tca[TCA_KIND] &&
1119                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1120                                         goto create_n_graft;
1121                         }
1122                 }
1123         } else {
1124                 if (!tcm->tcm_handle)
1125                         return -EINVAL;
1126                 q = qdisc_lookup(dev, tcm->tcm_handle);
1127         }
1128
1129         /* Change qdisc parameters */
1130         if (q == NULL)
1131                 return -ENOENT;
1132         if (n->nlmsg_flags & NLM_F_EXCL)
1133                 return -EEXIST;
1134         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1135                 return -EINVAL;
1136         err = qdisc_change(q, tca);
1137         if (err == 0)
1138                 qdisc_notify(net, skb, n, clid, NULL, q);
1139         return err;
1140
1141 create_n_graft:
1142         if (!(n->nlmsg_flags & NLM_F_CREATE))
1143                 return -ENOENT;
1144         if (clid == TC_H_INGRESS) {
1145                 if (dev_ingress_queue(dev))
1146                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1147                                          tcm->tcm_parent, tcm->tcm_parent,
1148                                          tca, &err);
1149                 else
1150                         err = -ENOENT;
1151         } else {
1152                 struct netdev_queue *dev_queue;
1153
1154                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1155                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1156                 else if (p)
1157                         dev_queue = p->dev_queue;
1158                 else
1159                         dev_queue = netdev_get_tx_queue(dev, 0);
1160
1161                 q = qdisc_create(dev, dev_queue, p,
1162                                  tcm->tcm_parent, tcm->tcm_handle,
1163                                  tca, &err);
1164         }
1165         if (q == NULL) {
1166                 if (err == -EAGAIN)
1167                         goto replay;
1168                 return err;
1169         }
1170
1171 graft:
1172         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1173         if (err) {
1174                 if (q)
1175                         qdisc_destroy(q);
1176                 return err;
1177         }
1178
1179         return 0;
1180 }
1181
1182 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1183                          u32 pid, u32 seq, u16 flags, int event)
1184 {
1185         struct tcmsg *tcm;
1186         struct nlmsghdr  *nlh;
1187         unsigned char *b = skb_tail_pointer(skb);
1188         struct gnet_dump d;
1189         struct qdisc_size_table *stab;
1190
1191         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1192         tcm = NLMSG_DATA(nlh);
1193         tcm->tcm_family = AF_UNSPEC;
1194         tcm->tcm__pad1 = 0;
1195         tcm->tcm__pad2 = 0;
1196         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1197         tcm->tcm_parent = clid;
1198         tcm->tcm_handle = q->handle;
1199         tcm->tcm_info = atomic_read(&q->refcnt);
1200         NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1201         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1202                 goto nla_put_failure;
1203         q->qstats.qlen = q->q.qlen;
1204
1205         stab = rtnl_dereference(q->stab);
1206         if (stab && qdisc_dump_stab(skb, stab) < 0)
1207                 goto nla_put_failure;
1208
1209         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1210                                          qdisc_root_sleeping_lock(q), &d) < 0)
1211                 goto nla_put_failure;
1212
1213         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1214                 goto nla_put_failure;
1215
1216         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1217             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1218             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1219                 goto nla_put_failure;
1220
1221         if (gnet_stats_finish_copy(&d) < 0)
1222                 goto nla_put_failure;
1223
1224         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1225         return skb->len;
1226
1227 nlmsg_failure:
1228 nla_put_failure:
1229         nlmsg_trim(skb, b);
1230         return -1;
1231 }
1232
1233 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1234 {
1235         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1236 }
1237
1238 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1239                         struct nlmsghdr *n, u32 clid,
1240                         struct Qdisc *old, struct Qdisc *new)
1241 {
1242         struct sk_buff *skb;
1243         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1244
1245         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1246         if (!skb)
1247                 return -ENOBUFS;
1248
1249         if (old && !tc_qdisc_dump_ignore(old)) {
1250                 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
1251                                   0, RTM_DELQDISC) < 0)
1252                         goto err_out;
1253         }
1254         if (new && !tc_qdisc_dump_ignore(new)) {
1255                 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
1256                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1257                         goto err_out;
1258         }
1259
1260         if (skb->len)
1261                 return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1262                                       n->nlmsg_flags & NLM_F_ECHO);
1263
1264 err_out:
1265         kfree_skb(skb);
1266         return -EINVAL;
1267 }
1268
1269 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1270                               struct netlink_callback *cb,
1271                               int *q_idx_p, int s_q_idx)
1272 {
1273         int ret = 0, q_idx = *q_idx_p;
1274         struct Qdisc *q;
1275
1276         if (!root)
1277                 return 0;
1278
1279         q = root;
1280         if (q_idx < s_q_idx) {
1281                 q_idx++;
1282         } else {
1283                 if (!tc_qdisc_dump_ignore(q) &&
1284                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1285                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1286                         goto done;
1287                 q_idx++;
1288         }
1289         list_for_each_entry(q, &root->list, list) {
1290                 if (q_idx < s_q_idx) {
1291                         q_idx++;
1292                         continue;
1293                 }
1294                 if (!tc_qdisc_dump_ignore(q) &&
1295                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1296                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1297                         goto done;
1298                 q_idx++;
1299         }
1300
1301 out:
1302         *q_idx_p = q_idx;
1303         return ret;
1304 done:
1305         ret = -1;
1306         goto out;
1307 }
1308
1309 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1310 {
1311         struct net *net = sock_net(skb->sk);
1312         int idx, q_idx;
1313         int s_idx, s_q_idx;
1314         struct net_device *dev;
1315
1316         s_idx = cb->args[0];
1317         s_q_idx = q_idx = cb->args[1];
1318
1319         rcu_read_lock();
1320         idx = 0;
1321         for_each_netdev_rcu(net, dev) {
1322                 struct netdev_queue *dev_queue;
1323
1324                 if (idx < s_idx)
1325                         goto cont;
1326                 if (idx > s_idx)
1327                         s_q_idx = 0;
1328                 q_idx = 0;
1329
1330                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1331                         goto done;
1332
1333                 dev_queue = dev_ingress_queue(dev);
1334                 if (dev_queue &&
1335                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1336                                        &q_idx, s_q_idx) < 0)
1337                         goto done;
1338
1339 cont:
1340                 idx++;
1341         }
1342
1343 done:
1344         rcu_read_unlock();
1345
1346         cb->args[0] = idx;
1347         cb->args[1] = q_idx;
1348
1349         return skb->len;
1350 }
1351
1352
1353
1354 /************************************************
1355  *      Traffic classes manipulation.           *
1356  ************************************************/
1357
1358
1359
1360 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1361 {
1362         struct net *net = sock_net(skb->sk);
1363         struct tcmsg *tcm = NLMSG_DATA(n);
1364         struct nlattr *tca[TCA_MAX + 1];
1365         struct net_device *dev;
1366         struct Qdisc *q = NULL;
1367         const struct Qdisc_class_ops *cops;
1368         unsigned long cl = 0;
1369         unsigned long new_cl;
1370         u32 pid = tcm->tcm_parent;
1371         u32 clid = tcm->tcm_handle;
1372         u32 qid = TC_H_MAJ(clid);
1373         int err;
1374
1375         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1376         if (!dev)
1377                 return -ENODEV;
1378
1379         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1380         if (err < 0)
1381                 return err;
1382
1383         /*
1384            parent == TC_H_UNSPEC - unspecified parent.
1385            parent == TC_H_ROOT   - class is root, which has no parent.
1386            parent == X:0         - parent is root class.
1387            parent == X:Y         - parent is a node in hierarchy.
1388            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1389
1390            handle == 0:0         - generate handle from kernel pool.
1391            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1392            handle == X:Y         - clear.
1393            handle == X:0         - root class.
1394          */
1395
1396         /* Step 1. Determine qdisc handle X:0 */
1397
1398         if (pid != TC_H_ROOT) {
1399                 u32 qid1 = TC_H_MAJ(pid);
1400
1401                 if (qid && qid1) {
1402                         /* If both majors are known, they must be identical. */
1403                         if (qid != qid1)
1404                                 return -EINVAL;
1405                 } else if (qid1) {
1406                         qid = qid1;
1407                 } else if (qid == 0)
1408                         qid = dev->qdisc->handle;
1409
1410                 /* Now qid is genuine qdisc handle consistent
1411                  * both with parent and child.
1412                  *
1413                  * TC_H_MAJ(pid) still may be unspecified, complete it now.
1414                  */
1415                 if (pid)
1416                         pid = TC_H_MAKE(qid, pid);
1417         } else {
1418                 if (qid == 0)
1419                         qid = dev->qdisc->handle;
1420         }
1421
1422         /* OK. Locate qdisc */
1423         q = qdisc_lookup(dev, qid);
1424         if (!q)
1425                 return -ENOENT;
1426
1427         /* An check that it supports classes */
1428         cops = q->ops->cl_ops;
1429         if (cops == NULL)
1430                 return -EINVAL;
1431
1432         /* Now try to get class */
1433         if (clid == 0) {
1434                 if (pid == TC_H_ROOT)
1435                         clid = qid;
1436         } else
1437                 clid = TC_H_MAKE(qid, clid);
1438
1439         if (clid)
1440                 cl = cops->get(q, clid);
1441
1442         if (cl == 0) {
1443                 err = -ENOENT;
1444                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1445                     !(n->nlmsg_flags & NLM_F_CREATE))
1446                         goto out;
1447         } else {
1448                 switch (n->nlmsg_type) {
1449                 case RTM_NEWTCLASS:
1450                         err = -EEXIST;
1451                         if (n->nlmsg_flags & NLM_F_EXCL)
1452                                 goto out;
1453                         break;
1454                 case RTM_DELTCLASS:
1455                         err = -EOPNOTSUPP;
1456                         if (cops->delete)
1457                                 err = cops->delete(q, cl);
1458                         if (err == 0)
1459                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1460                         goto out;
1461                 case RTM_GETTCLASS:
1462                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1463                         goto out;
1464                 default:
1465                         err = -EINVAL;
1466                         goto out;
1467                 }
1468         }
1469
1470         new_cl = cl;
1471         err = -EOPNOTSUPP;
1472         if (cops->change)
1473                 err = cops->change(q, clid, pid, tca, &new_cl);
1474         if (err == 0)
1475                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1476
1477 out:
1478         if (cl)
1479                 cops->put(q, cl);
1480
1481         return err;
1482 }
1483
1484
1485 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1486                           unsigned long cl,
1487                           u32 pid, u32 seq, u16 flags, int event)
1488 {
1489         struct tcmsg *tcm;
1490         struct nlmsghdr  *nlh;
1491         unsigned char *b = skb_tail_pointer(skb);
1492         struct gnet_dump d;
1493         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1494
1495         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1496         tcm = NLMSG_DATA(nlh);
1497         tcm->tcm_family = AF_UNSPEC;
1498         tcm->tcm__pad1 = 0;
1499         tcm->tcm__pad2 = 0;
1500         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1501         tcm->tcm_parent = q->handle;
1502         tcm->tcm_handle = q->handle;
1503         tcm->tcm_info = 0;
1504         NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1505         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1506                 goto nla_put_failure;
1507
1508         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1509                                          qdisc_root_sleeping_lock(q), &d) < 0)
1510                 goto nla_put_failure;
1511
1512         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1513                 goto nla_put_failure;
1514
1515         if (gnet_stats_finish_copy(&d) < 0)
1516                 goto nla_put_failure;
1517
1518         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1519         return skb->len;
1520
1521 nlmsg_failure:
1522 nla_put_failure:
1523         nlmsg_trim(skb, b);
1524         return -1;
1525 }
1526
1527 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1528                          struct nlmsghdr *n, struct Qdisc *q,
1529                          unsigned long cl, int event)
1530 {
1531         struct sk_buff *skb;
1532         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1533
1534         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1535         if (!skb)
1536                 return -ENOBUFS;
1537
1538         if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1539                 kfree_skb(skb);
1540                 return -EINVAL;
1541         }
1542
1543         return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1544                               n->nlmsg_flags & NLM_F_ECHO);
1545 }
1546
1547 struct qdisc_dump_args {
1548         struct qdisc_walker     w;
1549         struct sk_buff          *skb;
1550         struct netlink_callback *cb;
1551 };
1552
1553 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1554 {
1555         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1556
1557         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1558                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1559 }
1560
1561 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1562                                 struct tcmsg *tcm, struct netlink_callback *cb,
1563                                 int *t_p, int s_t)
1564 {
1565         struct qdisc_dump_args arg;
1566
1567         if (tc_qdisc_dump_ignore(q) ||
1568             *t_p < s_t || !q->ops->cl_ops ||
1569             (tcm->tcm_parent &&
1570              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1571                 (*t_p)++;
1572                 return 0;
1573         }
1574         if (*t_p > s_t)
1575                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1576         arg.w.fn = qdisc_class_dump;
1577         arg.skb = skb;
1578         arg.cb = cb;
1579         arg.w.stop  = 0;
1580         arg.w.skip = cb->args[1];
1581         arg.w.count = 0;
1582         q->ops->cl_ops->walk(q, &arg.w);
1583         cb->args[1] = arg.w.count;
1584         if (arg.w.stop)
1585                 return -1;
1586         (*t_p)++;
1587         return 0;
1588 }
1589
1590 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1591                                struct tcmsg *tcm, struct netlink_callback *cb,
1592                                int *t_p, int s_t)
1593 {
1594         struct Qdisc *q;
1595
1596         if (!root)
1597                 return 0;
1598
1599         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1600                 return -1;
1601
1602         list_for_each_entry(q, &root->list, list) {
1603                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1604                         return -1;
1605         }
1606
1607         return 0;
1608 }
1609
1610 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1611 {
1612         struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
1613         struct net *net = sock_net(skb->sk);
1614         struct netdev_queue *dev_queue;
1615         struct net_device *dev;
1616         int t, s_t;
1617
1618         if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1619                 return 0;
1620         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1621         if (!dev)
1622                 return 0;
1623
1624         s_t = cb->args[0];
1625         t = 0;
1626
1627         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1628                 goto done;
1629
1630         dev_queue = dev_ingress_queue(dev);
1631         if (dev_queue &&
1632             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1633                                 &t, s_t) < 0)
1634                 goto done;
1635
1636 done:
1637         cb->args[0] = t;
1638
1639         dev_put(dev);
1640         return skb->len;
1641 }
1642
1643 /* Main classifier routine: scans classifier chain attached
1644  * to this qdisc, (optionally) tests for protocol and asks
1645  * specific classifiers.
1646  */
1647 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1648                        struct tcf_result *res)
1649 {
1650         __be16 protocol = skb->protocol;
1651         int err;
1652
1653         for (; tp; tp = tp->next) {
1654                 if (tp->protocol != protocol &&
1655                     tp->protocol != htons(ETH_P_ALL))
1656                         continue;
1657                 err = tp->classify(skb, tp, res);
1658
1659                 if (err >= 0) {
1660 #ifdef CONFIG_NET_CLS_ACT
1661                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1662                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1663 #endif
1664                         return err;
1665                 }
1666         }
1667         return -1;
1668 }
1669 EXPORT_SYMBOL(tc_classify_compat);
1670
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is tc_classify_compat().  With it, a
 * TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain; the number of restarts is tracked in skb->tc_verd, and once it
 * has reached MAX_REC_LOOP the packet gets TC_ACT_SHOT instead.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *head = tp;
	int err;

	for (;;) {
		u32 verd;

		err = tc_classify_compat(skb, tp, res);
		if (err != TC_ACT_RECLASSIFY)
			return err;

		verd = G_TC_VERD(skb->tc_verd);
		tp = head;

		if (verd >= MAX_REC_LOOP) {
			if (net_ratelimit())
				pr_notice("%s: packet reclassify loop"
					  " rule prio %u protocol %02x\n",
					  tp->q->ops->id,
					  tp->prio & 0xffff,
					  ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}

		verd++;
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1702
1703 void tcf_destroy(struct tcf_proto *tp)
1704 {
1705         tp->ops->destroy(tp);
1706         module_put(tp->ops->owner);
1707         kfree(tp);
1708 }
1709
1710 void tcf_destroy_chain(struct tcf_proto **fl)
1711 {
1712         struct tcf_proto *tp;
1713
1714         while ((tp = *fl) != NULL) {
1715                 *fl = tp->next;
1716                 tcf_destroy(tp);
1717         }
1718 }
1719 EXPORT_SYMBOL(tcf_destroy_chain);
1720
1721 #ifdef CONFIG_PROC_FS
1722 static int psched_show(struct seq_file *seq, void *v)
1723 {
1724         struct timespec ts;
1725
1726         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1727         seq_printf(seq, "%08x %08x %08x %08x\n",
1728                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1729                    1000000,
1730                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1731
1732         return 0;
1733 }
1734
1735 static int psched_open(struct inode *inode, struct file *file)
1736 {
1737         return single_open(file, psched_show, NULL);
1738 }
1739
1740 static const struct file_operations psched_fops = {
1741         .owner = THIS_MODULE,
1742         .open = psched_open,
1743         .read  = seq_read,
1744         .llseek = seq_lseek,
1745         .release = single_release,
1746 };
1747
1748 static int __net_init psched_net_init(struct net *net)
1749 {
1750         struct proc_dir_entry *e;
1751
1752         e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1753         if (e == NULL)
1754                 return -ENOMEM;
1755
1756         return 0;
1757 }
1758
1759 static void __net_exit psched_net_exit(struct net *net)
1760 {
1761         proc_net_remove(net, "psched");
1762 }
1763 #else
1764 static int __net_init psched_net_init(struct net *net)
1765 {
1766         return 0;
1767 }
1768
1769 static void __net_exit psched_net_exit(struct net *net)
1770 {
1771 }
1772 #endif
1773
1774 static struct pernet_operations psched_net_ops = {
1775         .init = psched_net_init,
1776         .exit = psched_net_exit,
1777 };
1778
1779 static int __init pktsched_init(void)
1780 {
1781         int err;
1782
1783         err = register_pernet_subsys(&psched_net_ops);
1784         if (err) {
1785                 pr_err("pktsched_init: "
1786                        "cannot initialize per netns operations\n");
1787                 return err;
1788         }
1789
1790         register_qdisc(&pfifo_qdisc_ops);
1791         register_qdisc(&bfifo_qdisc_ops);
1792         register_qdisc(&pfifo_head_drop_qdisc_ops);
1793         register_qdisc(&mq_qdisc_ops);
1794
1795         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1796         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1797         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1798         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1799         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1800         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1801
1802         return 0;
1803 }
1804
1805 subsys_initcall(pktsched_init);