/* net/sched/sch_teql.c	"True" (or "trivial") link equalizer.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/moduleparam.h>
#include <net/dst.h>
#include <net/neighbour.h>
#include <net/pkt_sched.h>

/*
   How to set it up.
   -----------------

   After loading this module you will find a new device teqlN
   and a new qdisc with the same name. To join a slave to the equalizer,
   just attach this qdisc to the device as its root, e.g.:

   # tc qdisc add dev eth0 root teql0
   # tc qdisc add dev eth1 root teql0

   That's all. Full PnP 8)

   Applicability.
   --------------

   1. Slave devices MUST be active devices, i.e., they must raise the tbusy
      signal and generate EOI events. If you want to equalize virtual devices
      like tunnels, use a normal eql device instead.
   2. This device puts no limitations on physical slave characteristics,
      e.g. it will equalize a 9600 baud line and 100Mbit ethernet perfectly :-)
      Certainly, a large difference in link speeds will make the resulting
      equalized link unusable, because of massive packet reordering.
      I estimate the upper useful speed ratio as ~10 times.
   3. If a slave requires address resolution, only protocols using the
      neighbour cache (IPv4/IPv6) will work over the equalized link.
      Other protocols are still allowed to use the slave device directly,
      which will not break load balancing, though native slave
      traffic will have the highest priority.  */
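
/* Illustrative example (assumes the iproute2 tools are available):
   once the slaves are enslaved as above, bring the master up and
   route traffic through it, e.g.

   # ip link set teql0 up
   # ip route add 10.0.0.0/24 dev teql0

   A slave is detached again by replacing or deleting its root qdisc:

   # tc qdisc del dev eth0 root
 */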

struct teql_master {
	struct Qdisc_ops qops;
	struct net_device *dev;
	struct Qdisc *slaves;
	struct list_head master_list;
	unsigned long	tx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_errors;
	unsigned long	tx_dropped;
};

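/* Per-slave qdisc state.  ->next links the slave qdiscs into a circular
 * list; master->slaves points at the slave to try first, so transmission
 * round-robins over the ring.  ->ncache caches the neighbour entry last
 * resolved for this slave.
 */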
struct teql_sched_data {
	struct Qdisc *next;
	struct teql_master *m;
	struct neighbour *ncache;
	struct sk_buff_head q;
};

#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)

#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)

/* "teql*" qdisc routines */

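/* Enqueue on this slave's private FIFO, bounded by the slave device's
 * tx_queue_len; anything beyond that is tail-dropped and counted in the
 * qdisc drop statistics.
 */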
static int
teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct teql_sched_data *q = qdisc_priv(sch);

	if (q->q.qlen < dev->tx_queue_len) {
		__skb_queue_tail(&q->q, skb);
		return NET_XMIT_SUCCESS;
	}

	kfree_skb(skb);
	sch->qstats.drops++;
	return NET_XMIT_DROP;
}

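/* Dequeue from this slave's FIFO.  If the slave ran dry, make it the
 * first candidate for the next transmission and wake the master so any
 * traffic queued there can be rescheduled.  The reported queue length
 * includes packets still sitting in the master device's own qdisc,
 * since those will eventually be fed to a slave as well.
 */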
static struct sk_buff *
teql_dequeue(struct Qdisc *sch)
{
	struct teql_sched_data *dat = qdisc_priv(sch);
	struct netdev_queue *dat_queue;
	struct sk_buff *skb;

	skb = __skb_dequeue(&dat->q);
	dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
	if (skb == NULL) {
		struct net_device *m = qdisc_dev(dat_queue->qdisc);

		if (m) {
			dat->m->slaves = sch;
			netif_wake_queue(m);
		}
	} else {
		qdisc_bstats_update(sch, skb);
	}
	sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
	return skb;
}

static struct sk_buff *
teql_peek(struct Qdisc *sch)
{
	/* teql is meant to be used as root qdisc */
	return NULL;
}

static inline void
teql_neigh_release(struct neighbour *n)
{
	if (n)
		neigh_release(n);
}

static void
teql_reset(struct Qdisc *sch)
{
	struct teql_sched_data *dat = qdisc_priv(sch);

	skb_queue_purge(&dat->q);
	sch->q.qlen = 0;
	teql_neigh_release(xchg(&dat->ncache, NULL));
}

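/* Unlink this qdisc from the master's circular slave list.  If it was
 * the last slave, the master is left with no slaves and its own qdisc
 * is reset under the root lock so no stale packets remain queued.
 */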
static void
teql_destroy(struct Qdisc *sch)
{
	struct Qdisc *q, *prev;
	struct teql_sched_data *dat = qdisc_priv(sch);
	struct teql_master *master = dat->m;

	prev = master->slaves;
	if (prev) {
		do {
			q = NEXT_SLAVE(prev);
			if (q == sch) {
				NEXT_SLAVE(prev) = NEXT_SLAVE(q);
				if (q == master->slaves) {
					master->slaves = NEXT_SLAVE(q);
					if (q == master->slaves) {
						struct netdev_queue *txq;
						spinlock_t *root_lock;

						txq = netdev_get_tx_queue(master->dev, 0);
						master->slaves = NULL;

						root_lock = qdisc_root_sleeping_lock(txq->qdisc);
						spin_lock_bh(root_lock);
						qdisc_reset(txq->qdisc);
						spin_unlock_bh(root_lock);
					}
				}
				skb_queue_purge(&dat->q);
				teql_neigh_release(xchg(&dat->ncache, NULL));
				break;
			}

		} while ((prev = q) != master->slaves);
	}
}

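/* Admit a device into the equalizer.  The slave must not be the master
 * itself and must have room for the master's link-layer header.  While
 * the master is up, a new slave must support everything the master
 * already advertises; while it is down, the master's
 * BROADCAST/POINTOPOINT/MULTICAST flags and MTU are simply narrowed to
 * what all slaves have in common.
 */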
static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct net_device *dev = qdisc_dev(sch);
	struct teql_master *m = (struct teql_master *)sch->ops;
	struct teql_sched_data *q = qdisc_priv(sch);

	if (dev->hard_header_len > m->dev->hard_header_len)
		return -EINVAL;

	if (m->dev == dev)
		return -ELOOP;

	q->m = m;

	skb_queue_head_init(&q->q);

	if (m->slaves) {
		if (m->dev->flags & IFF_UP) {
			if ((m->dev->flags & IFF_POINTOPOINT &&
			     !(dev->flags & IFF_POINTOPOINT)) ||
			    (m->dev->flags & IFF_BROADCAST &&
			     !(dev->flags & IFF_BROADCAST)) ||
			    (m->dev->flags & IFF_MULTICAST &&
			     !(dev->flags & IFF_MULTICAST)) ||
			    dev->mtu < m->dev->mtu)
				return -EINVAL;
		} else {
			if (!(dev->flags & IFF_POINTOPOINT))
				m->dev->flags &= ~IFF_POINTOPOINT;
			if (!(dev->flags & IFF_BROADCAST))
				m->dev->flags &= ~IFF_BROADCAST;
			if (!(dev->flags & IFF_MULTICAST))
				m->dev->flags &= ~IFF_MULTICAST;
			if (dev->mtu < m->dev->mtu)
				m->dev->mtu = dev->mtu;
		}
		q->next = NEXT_SLAVE(m->slaves);
		NEXT_SLAVE(m->slaves) = sch;
	} else {
		q->next = sch;
		m->slaves = sch;
		m->dev->mtu = dev->mtu;
		m->dev->flags = (m->dev->flags & ~FMASK) | (dev->flags & FMASK);
	}
	return 0;
}

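/* Rebuild the link-layer header for a slave device.  The neighbour
 * resolved for the master is looked up again in the context of the
 * slave, with a per-slave one-entry cache (q->ncache) to avoid a fresh
 * lookup when consecutive packets go to the same host.
 */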
static int
__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
	       struct net_device *dev, struct netdev_queue *txq,
	       struct neighbour *mn)
{
	struct teql_sched_data *q = qdisc_priv(txq->qdisc);
	struct neighbour *n = q->ncache;

	if (mn->tbl == NULL)
		return -EINVAL;
	if (n && n->tbl == mn->tbl &&
	    memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
		atomic_inc(&n->refcnt);
	} else {
		n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
		if (IS_ERR(n))
			return PTR_ERR(n);
	}
	if (neigh_event_send(n, skb_res) == 0) {
		int err;
		char haddr[MAX_ADDR_LEN];

		neigh_ha_snapshot(haddr, n, dev);
		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
				      NULL, skb->len);

		if (err < 0) {
			neigh_release(n);
			return -EINVAL;
		}
		teql_neigh_release(xchg(&q->ncache, n));
		return 0;
	}
	neigh_release(n);
	return (skb_res == NULL) ? -EAGAIN : 1;
}

static inline int teql_resolve(struct sk_buff *skb,
			       struct sk_buff *skb_res,
			       struct net_device *dev,
			       struct netdev_queue *txq)
{
	struct dst_entry *dst = skb_dst(skb);
	struct neighbour *mn;
	int res;

	if (txq->qdisc == &noop_qdisc)
		return -ENODEV;

	if (!dev->header_ops || !dst)
		return 0;

	rcu_read_lock();
	mn = dst_get_neighbour(dst);
	res = mn ? __teql_resolve(skb, skb_res, dev, txq, mn) : 0;
	rcu_read_unlock();

	return res;
}

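/* Transmit on behalf of the master: walk the slave ring starting at
 * master->slaves and hand the skb to the first slave that is running,
 * not stopped and whose neighbour entry resolves.  A return of 1 from
 * teql_resolve means resolution was kicked off and the packet consumed;
 * a negative return marks the slave unresolved, and after a full pass
 * the loop is restarted once with skb_res set so a neighbour
 * solicitation can be generated.  If every usable slave was merely
 * busy, stop the master queue and report NETDEV_TX_BUSY so the packet
 * is retried.
 */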
static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct teql_master *master = netdev_priv(dev);
	struct Qdisc *start, *q;
	int busy;
	int nores;
	int subq = skb_get_queue_mapping(skb);
	struct sk_buff *skb_res = NULL;

	start = master->slaves;

restart:
	nores = 0;
	busy = 0;

	q = start;
	if (!q)
		goto drop;

	do {
		struct net_device *slave = qdisc_dev(q);
		struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
		const struct net_device_ops *slave_ops = slave->netdev_ops;

		if (slave_txq->qdisc_sleeping != q)
			continue;
		if (__netif_subqueue_stopped(slave, subq) ||
		    !netif_running(slave)) {
			busy = 1;
			continue;
		}

		switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
		case 0:
			if (__netif_tx_trylock(slave_txq)) {
				unsigned int length = qdisc_pkt_len(skb);

				if (!netif_tx_queue_frozen_or_stopped(slave_txq) &&
				    slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) {
					txq_trans_update(slave_txq);
					__netif_tx_unlock(slave_txq);
					master->slaves = NEXT_SLAVE(q);
					netif_wake_queue(dev);
					master->tx_packets++;
					master->tx_bytes += length;
					return NETDEV_TX_OK;
				}
				__netif_tx_unlock(slave_txq);
			}
			if (netif_queue_stopped(dev))
				busy = 1;
			break;
		case 1:
			master->slaves = NEXT_SLAVE(q);
			return NETDEV_TX_OK;
		default:
			nores = 1;
			break;
		}
		__skb_pull(skb, skb_network_offset(skb));
	} while ((q = NEXT_SLAVE(q)) != start);

	if (nores && skb_res == NULL) {
		skb_res = skb;
		goto restart;
	}

	if (busy) {
		netif_stop_queue(dev);
		return NETDEV_TX_BUSY;
	}
	master->tx_errors++;

drop:
	master->tx_dropped++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

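/* Bringing the master up recomputes its link properties from the slave
 * ring: the master MTU becomes the minimum over all slaves, and a flag
 * in FMASK survives only if every slave has it, so the master is
 * BROADCAST only if all slaves are, PtP only if all slaves are, and
 * NBMA otherwise.
 */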
static int teql_master_open(struct net_device *dev)
{
	struct Qdisc *q;
	struct teql_master *m = netdev_priv(dev);
	int mtu = 0xFFFE;
	unsigned int flags = IFF_NOARP | IFF_MULTICAST;

	if (m->slaves == NULL)
		return -EUNATCH;

	flags = FMASK;

	q = m->slaves;
	do {
		struct net_device *slave = qdisc_dev(q);

		if (slave == NULL)
			return -EUNATCH;

		if (slave->mtu < mtu)
			mtu = slave->mtu;
		if (slave->hard_header_len > LL_MAX_HEADER)
			return -EINVAL;

		/* If all the slaves are BROADCAST, master is BROADCAST
		   If all the slaves are PtP, master is PtP
		   Otherwise, master is NBMA.
		 */
		if (!(slave->flags & IFF_POINTOPOINT))
			flags &= ~IFF_POINTOPOINT;
		if (!(slave->flags & IFF_BROADCAST))
			flags &= ~IFF_BROADCAST;
		if (!(slave->flags & IFF_MULTICAST))
			flags &= ~IFF_MULTICAST;
	} while ((q = NEXT_SLAVE(q)) != m->slaves);

	m->dev->mtu = mtu;
	m->dev->flags = (m->dev->flags & ~FMASK) | flags;
	netif_start_queue(m->dev);
	return 0;
}

static int teql_master_close(struct net_device *dev)
{
	netif_stop_queue(dev);
	return 0;
}

static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
						     struct rtnl_link_stats64 *stats)
{
	struct teql_master *m = netdev_priv(dev);

	stats->tx_packets	= m->tx_packets;
	stats->tx_bytes		= m->tx_bytes;
	stats->tx_errors	= m->tx_errors;
	stats->tx_dropped	= m->tx_dropped;
	return stats;
}

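/* The master MTU may be raised only up to the smallest slave MTU, and
 * never below 68, the minimum MTU IPv4 requires every link to support.
 */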
static int teql_master_mtu(struct net_device *dev, int new_mtu)
{
	struct teql_master *m = netdev_priv(dev);
	struct Qdisc *q;

	if (new_mtu < 68)
		return -EINVAL;

	q = m->slaves;
	if (q) {
		do {
			if (new_mtu > qdisc_dev(q)->mtu)
				return -EINVAL;
		} while ((q = NEXT_SLAVE(q)) != m->slaves);
	}

	dev->mtu = new_mtu;
	return 0;
}

static const struct net_device_ops teql_netdev_ops = {
	.ndo_open	= teql_master_open,
	.ndo_stop	= teql_master_close,
	.ndo_start_xmit	= teql_master_xmit,
	.ndo_get_stats64 = teql_master_stats64,
	.ndo_change_mtu	= teql_master_mtu,
};

static __init void teql_master_setup(struct net_device *dev)
{
	struct teql_master *master = netdev_priv(dev);
	struct Qdisc_ops *ops = &master->qops;

	master->dev	= dev;
	ops->priv_size	= sizeof(struct teql_sched_data);

	ops->enqueue	=	teql_enqueue;
	ops->dequeue	=	teql_dequeue;
	ops->peek	=	teql_peek;
	ops->init	=	teql_qdisc_init;
	ops->reset	=	teql_reset;
	ops->destroy	=	teql_destroy;
	ops->owner	=	THIS_MODULE;

	dev->netdev_ops		= &teql_netdev_ops;
	dev->type		= ARPHRD_VOID;
	dev->mtu		= 1500;
	dev->tx_queue_len	= 100;
	dev->flags		= IFF_NOARP;
	dev->hard_header_len	= LL_MAX_HEADER;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static LIST_HEAD(master_dev_list);
static int max_equalizers = 1;
module_param(max_equalizers, int, 0);
MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");

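/* Create max_equalizers master devices.  Each master embeds its own
 * Qdisc_ops whose id is the device name ("teql0", "teql1", ...), which
 * is how "tc qdisc add ... teqlN" binds a slave to that particular
 * master.  If device N fails to set up, the earlier masters are kept
 * and the error is returned only when not even one was created.
 */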
static int __init teql_init(void)
{
	int i;
	int err = -ENODEV;

	for (i = 0; i < max_equalizers; i++) {
		struct net_device *dev;
		struct teql_master *master;

		dev = alloc_netdev(sizeof(struct teql_master),
				   "teql%d", teql_master_setup);
		if (!dev) {
			err = -ENOMEM;
			break;
		}

		err = register_netdev(dev);
		if (err) {
			free_netdev(dev);
			break;
		}

		master = netdev_priv(dev);

		strlcpy(master->qops.id, dev->name, IFNAMSIZ);
		err = register_qdisc(&master->qops);

		if (err) {
			unregister_netdev(dev);
			free_netdev(dev);
			break;
		}

		list_add_tail(&master->master_list, &master_dev_list);
	}
	return i ? 0 : err;
}

static void __exit teql_exit(void)
{
	struct teql_master *master, *nxt;

	list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) {
		list_del(&master->master_list);

		unregister_qdisc(&master->qops);
		unregister_netdev(master->dev);
		free_netdev(master->dev);
	}
}

module_init(teql_init);
module_exit(teql_exit);

MODULE_LICENSE("GPL");