/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, "DummyNet for FreeBSD"

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can also be loaded from a table to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.
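
	Example (illustrative iproute2 commands; the exact option syntax
	is documented in tc-netem(8), and eth0 is only a placeholder):

	  # 100ms delay, 10ms jitter, 25% correlation between samples
	  tc qdisc add dev eth0 root netem delay 100ms 10ms 25%

	  # independent 0.3% random packet loss
	  tc qdisc change dev eth0 root netem loss 0.3%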

	Correlated Loss Generator models

	Added generation of correlated loss according to the 4-state
	Markov chain of the GI (General and Intuitive) model and to the
	2-state "Gilbert-Elliot" model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;
};
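
/* For the GE mapping above: with p = P(good->bad) and r = P(bad->good),
 * sojourn times in a two-state chain are geometric, so bad periods last
 * 1/r packets and good periods 1/p packets on average.
 */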

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(skb->cb) <
		sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
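
/* The blend above is fixed-point arithmetic with rho read as a fraction
 * of 2^32: answer = value*(1 - f) + last*f, where f = rho/2^32. For
 * example, rho = 0x80000000 (f ~= 0.5) returns roughly the average of the
 * fresh random value and the previous output.
 */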

/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = net_random();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   1 => successfully transmitted packets within a gap period
	 *   4 => isolated losses within a gap period
	 *   3 => lost packets within a burst period
	 *   2 => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case 1:
		if (rnd < clg->a4) {
			clg->state = 4;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1) {
			clg->state = 3;
			return true;
		} else if (clg->a1 < rnd)
			clg->state = 1;

		break;
	case 2:
		if (rnd < clg->a5) {
			clg->state = 3;
			return true;
		} else
			clg->state = 2;

		break;
	case 3:
		if (rnd < clg->a3)
			clg->state = 2;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = 1;
			return true;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = 3;
			return true;
		}

		break;
	case 4:
		clg->state = 1;
		break;
	}

	return false;
}
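
/* Illustrative user-space setup for this model (argument order per
 * tc-netem(8); the values are only an example):
 *
 *   tc qdisc add dev eth0 root netem loss state 1% 30% 20% 10% 1%
 *
 * i.e. p13, p31, p32, p23 and p14 in that order.
 */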

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case 1:
		if (net_random() < clg->a1)
			clg->state = 2;
		if (net_random() < clg->a4)
			return true;
		break;
	case 2:
		if (net_random() < clg->a2)
			clg->state = 1;
		if (clg->a3 > net_random())
			return true;
		break;
	}

	return false;
}
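
/* With the probabilities above scaled to 2^32, the chain spends on average
 * 2^32/a1 packets in the good state (1) and 2^32/a2 packets in the bad
 * state (2) per visit; a packet is lost with probability a4/2^32 in state 1
 * and a3/2^32 in state 2.
 */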

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4-state loss model algorithm (used also for the GI model)
		 * Extracts a value from the 4-state Markov loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm
		 * Extracts a value from the Gilbert-Elliot loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma. Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
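
/* The return statement computes mu + (sigma * t) / NETEM_DIST_SCALE without
 * overflowing, by splitting sigma into its quotient and remainder modulo
 * NETEM_DIST_SCALE and rounding the remainder term to nearest. For example,
 * with NETEM_DIST_SCALE at 8192, sigma = 10000 and a table entry t = 4096
 * (about half a standard deviation) yield approximately mu + 5000.
 */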

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	skb_orphan(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying.
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL &&
		     skb_checksum_help(skb))) {
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}

		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
	}

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();
		cb->time_to_send = now + delay;
		++q->counter;
		ret = qdisc_enqueue(skb, q->qdisc);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&q->qdisc->q, skb);
		q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
		q->qdisc->qstats.requeues++;
		ret = NET_XMIT_SUCCESS;
	}

	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret)) {
			sch->qstats.drops++;
			return ret;
		}
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}
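
/* Illustrative reordering setup (syntax per tc-netem(8), eth0 a placeholder):
 *
 *   tc qdisc change dev eth0 root netem delay 10ms reorder 25% gap 5
 *
 * Most packets take the delayed path above; once a gap of 5 delayed packets
 * has passed, the next packet is, with 25% probability, queued for immediate
 * transmission and thereby overtakes the delayed ones.
 */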

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (qdisc_is_throttled(sch))
		return NULL;

	skb = q->qdisc->ops->peek(q->qdisc);
	if (skb) {
		const struct netem_skb_cb *cb = netem_skb_cb(skb);
		psched_time_t now = psched_get_time();

		/* is the head packet due to be sent? */
		if (cb->time_to_send <= now) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				skb->tstamp.tv64 = 0;
#endif

			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
	}

	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	if (is_vmalloc_addr(d))
		vfree(d);
	else
		kfree(d);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;
	size_t s;

	if (n > NETEM_DIST_MAX)
		return -EINVAL;

	s = sizeof(struct disttable) + n * sizeof(s16);
	d = kmalloc(s, GFP_KERNEL);
	if (!d)
		d = vmalloc(s);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	dist_free(q->delay_dist);
	q->delay_dist = d;
	spin_unlock_bh(root_lock);
	return 0;
}

static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = 1;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;

			q->clg.state = 1;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
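
/* netem's TCA_OPTIONS attribute carries a fixed struct tc_netem_qopt first,
 * with any nested attributes packed after it, so the generic nla_parse()
 * cannot be applied to the attribute as a whole; parse_attr() skips the
 * aligned struct before parsing the remainder.
 */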

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	ret = fifo_set_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_info("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(sch, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			return ret;
	}

	if (tb[TCA_NETEM_REORDER])
		get_reorder(sch, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);

	q->loss_model = CLG_RANDOM;
	if (tb[TCA_NETEM_LOSS])
		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);

	return ret;
}

/*
 * Special case version of FIFO queue for use by netem.
 * It queues in order based on timestamps in skb's
 */
struct fifo_sched_data {
	u32 limit;
	psched_time_t oldest;
};

static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		/* Optimize for add at tail */
		if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
			q->oldest = tnext;
			return qdisc_enqueue_tail(nskb, sch);
		}

		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb = netem_skb_cb(skb);

			if (tnext >= cb->time_to_send)
				break;
		}

		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += qdisc_pkt_len(nskb);

		return NET_XMIT_SUCCESS;
	}

	return qdisc_reshape_fail(nskb, sch);
}
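
/* tfifo keeps the queue sorted by time_to_send: the common case (constant
 * delay, timestamps arriving in order) takes the tail fast path, while a
 * jittered or reordered packet walks backwards from the tail to find its
 * slot, an insertion sort that stays cheap when the displacement is small.
 */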

static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);

	if (opt) {
		struct tc_fifo_qopt *ctl = nla_data(opt);
		if (nla_len(opt) < sizeof(*ctl))
			return -EINVAL;

		q->limit = ctl->limit;
	} else
		q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);

	q->oldest = PSCHED_PASTPERFECT;
	return 0;
}

static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt opt = { .limit = q->limit };

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}

static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
	.id		=	"tfifo",
	.priv_size	=	sizeof(struct fifo_sched_data),
	.enqueue	=	tfifo_enqueue,
	.dequeue	=	qdisc_dequeue_head,
	.peek		=	qdisc_peek_head,
	.drop		=	qdisc_queue_drop,
	.init		=	tfifo_init,
	.reset		=	qdisc_reset_queue,
	.change		=	tfifo_init,
	.dump		=	tfifo_dump,
};

static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	qdisc_watchdog_init(&q->watchdog, sch);

	q->loss_model = CLG_RANDOM;
	q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
				     TC_H_MAKE(sch->handle, 1));
	if (!q->qdisc) {
		pr_notice("netem: tfifo qdisc creation failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_info("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
	dist_free(q->delay_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};

static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");