/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data participating in scheduling
   must additionally be protected with the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
     and this lock is used only in process context.
   - updates to the tree are made under the rtnl semaphore or
     from softirq context (the __qdisc_destroy rcu-callback),
     hence this lock needs local bh disabling.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
        write_lock_bh(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock_bh(&dev->queue_lock);
        write_unlock_bh(&qdisc_tree_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */
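
/*
 * For orientation only, what the rule above means for a non-LLTX driver
 * (hypothetical driver, not from this file): qdisc_restart() below grabs
 * netif_tx_lock with a trylock and releases dev->queue_lock before calling
 * dev_hard_start_xmit(), so the driver's hard_start_xmit() runs with
 * netif_tx_lock held and dev->queue_lock free:
 *
 *      static int mydrv_hard_start_xmit(struct sk_buff *skb,
 *                                       struct net_device *dev)
 *      {
 *              ...hand skb to the hardware, no qdisc locks touched...
 *              return NETDEV_TX_OK;
 *      }
 */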


/* Kick device.
   Note that this procedure can be called by a watchdog timer, so we
   do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);

                dev->gso_skb = NULL;

                /*
                 * When the driver has LLTX set it does its own locking
                 * in start_xmit. No need to add additional overhead by
                 * locking again. These checks are worth it because
                 * even uncongested locks can be quite expensive.
                 * The driver can do a trylock, as we do here, too; in
                 * case of lock contention it should return NETDEV_TX_LOCKED
                 * and the packet will be requeued.
                 */
                if (!nolock) {
                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error,
                                   when hard_start_xmit() recurses. We detect
                                   it by checking the xmit owner and drop the
                                   packet when a dead loop is detected.
                                */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        return -1;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                }

                {
                        /* And release the queue */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;

                                ret = dev_hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                netif_tx_unlock(dev);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        return -1;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                netif_tx_unlock(dev);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* Device kicked us out :(
                   This is possible in the following cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (f.e. dialout)
                   3. device is buggy (ppp)
                 */

requeue:
                if (skb->next)
                        dev->gso_skb = skb;
                else
                        q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 1;
        }
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
        if (unlikely(dev->qdisc == &noop_qdisc))
                goto out;

        while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
                /* NOTHING */;

out:
        clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
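
/*
 * For orientation only: __qdisc_run() is not called by drivers directly.
 * A simplified sketch of the wrapper that drives it (qdisc_run(), defined
 * outside this file, in include/net/pkt_sched.h in this era; details may
 * differ between kernel versions):
 *
 *      static inline void qdisc_run(struct net_device *dev)
 *      {
 *              if (!netif_queue_stopped(dev) &&
 *                  !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *                      __qdisc_run(dev);
 *      }
 *
 * The QDISC_RUNNING bit ensures that at most one CPU runs the qdisc of a
 * given device at a time; dev_deactivate() below spins on the same bit
 * before tearing the qdisc down.
 */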

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}
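
/*
 * For orientation only: the watchdog only ever fires for drivers that set
 * a tx_timeout handler.  A typical driver probe routine (hypothetical
 * names, not from this file) would do something like:
 *
 *      dev->tx_timeout = mydrv_tx_timeout;     (run under netif_tx_lock
 *                                               by dev_watchdog() above)
 *      dev->watchdog_timeo = 2 * HZ;           (a value <= 0 falls back
 *                                               to 5*HZ here)
 */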

static void dev_watchdog_up(struct net_device *dev)
{
        __netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}
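
/*
 * For orientation only: drivers call netif_carrier_on()/netif_carrier_off()
 * from their link-state handling, e.g. a PHY interrupt or a periodic link
 * poll (hypothetical helper, not from this file):
 *
 *      if (link_is_up(dev))
 *              netif_carrier_on(dev);
 *      else
 *              netif_carrier_off(dev);
 *
 * dev_activate() below deliberately defers attaching the real qdisc while
 * carrier is off; the linkwatch event fired here is what completes the
 * activation once carrier comes back.
 */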

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noop_qdisc.list),
};
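
/*
 * noop_qdisc is the built-in placeholder used while a device has no real
 * qdisc attached: dev_init_scheduler() and dev_deactivate() below point
 * dev->qdisc at it, so anything submitted in that state is freed and
 * reported as congestion (NET_XMIT_CN).
 */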

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
};
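
/*
 * Note that noqueue_qdisc deliberately leaves .enqueue NULL: the transmit
 * path checks q->enqueue and, when it is NULL, hands packets straight to
 * the driver without queueing them.  dev_activate() below installs it for
 * devices with tx_queue_len == 0, i.e. virtual interfaces.
 */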


static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
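
/*
 * prio2band maps the TC_PRIO_* value kept in skb->priority to one of the
 * three pfifo_fast bands; band 0 is always dequeued first.  For example,
 * TC_PRIO_INTERACTIVE (6) and TC_PRIO_CONTROL (7) map to band 0,
 * TC_PRIO_BESTEFFORT (0) to band 1, and TC_PRIO_BULK (2) to band 2.
 */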

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             =       "pfifo_fast",
        .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        sch->stats_lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        /* err is already negative (-ENOBUFS); negating it again would
           defeat the IS_ERR() check in qdisc_create_dflt(). */
        return ERR_PTR(err);
}
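
/*
 * For orientation only, a worked example of the sizing above: QDISC_ALIGNTO
 * is 32, so for pfifo_fast (priv_size = 3 * sizeof(struct sk_buff_head))
 * the allocation is QDISC_ALIGN(sizeof(struct Qdisc)) + priv_size + 31
 * bytes.  sch->padded records how far the aligned Qdisc sits past the start
 * of the raw buffer, so __qdisc_destroy() can recover the original pointer
 * for kfree().
 */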

struct Qdisc *qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the rcu callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        struct Qdisc_ops *ops = qdisc->ops;

#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        write_lock(&qdisc_tree_lock);
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);
        write_unlock(&qdisc_tree_lock);
        module_put(ops->owner);

        dev_put(qdisc->dev);
        kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct list_head cql = LIST_HEAD_INIT(cql);
        struct Qdisc *cq, *q, *n;

        if (qdisc->flags & TCQ_F_BUILTIN ||
                !atomic_dec_and_test(&qdisc->refcnt))
                return;

        if (!list_empty(&qdisc->list)) {
                if (qdisc->ops->cl_ops == NULL)
                        list_del(&qdisc->list);
                else
                        list_move(&qdisc->list, &cql);
        }

        /* unlink inner qdiscs from dev->qdisc_list immediately */
        list_for_each_entry(cq, &cql, list)
                list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
                        if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
                                if (q->ops->cl_ops == NULL)
                                        list_del_init(&q->list);
                                else
                                        list_move_tail(&q->list, &cql);
                        }
        list_for_each_entry_safe(cq, n, &cql, list)
                list_del_init(&cq->list);

        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
        /* No queueing discipline is attached to the device;
           create a default one, i.e. pfifo_fast for devices
           which need queueing and noqueue_qdisc for virtual
           interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        write_lock_bh(&qdisc_tree_lock);
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                        write_unlock_bh(&qdisc_tree_lock);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                write_lock_bh(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
                write_unlock_bh(&qdisc_tree_lock);
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}
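
/*
 * For orientation only: dev_activate()/dev_deactivate() are not driver
 * entry points; the core calls them from dev_open()/dev_close() in
 * net/core/dev.c, and the linkwatch code calls them again on carrier
 * changes (which is what picks up the "delay activation" case above).
 * The qdisc_sleeping/qdisc split is what lets a configured qdisc survive
 * the interface being brought down and up again.
 */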

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        /* Wait for outstanding dev_queue_xmit calls. */
        synchronize_rcu();

        /* Wait for outstanding qdisc_run calls. */
        while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                yield();

        if (dev->gso_skb) {
                kfree_skb(dev->gso_skb);
                dev->gso_skb = NULL;
        }
}

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);