net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25
  26 #include <net/netlink.h>
  27 #include <net/pkt_sched.h>
  28
  29 #define VERSION "1.3"
  30
  31 /*      Network Emulation Queuing algorithm.
  32         ====================================
  33
   34         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
   35                  Network Emulation Tool"
  36                  [2] Luigi Rizzo, DummyNet for FreeBSD
  37
  38          ----------------------------------------------------------------
  39
  40          This started out as a simple way to delay outgoing packets to
  41          test TCP but has grown to include most of the functionality
  42          of a full blown network emulator like NISTnet. It can delay
  43          packets and add random jitter (and correlation). The random
  44          distribution can be loaded from a table as well to provide
  45          normal, Pareto, or experimental curves. Packet loss,
  46          duplication, and reordering can also be emulated.
  47
  48          This qdisc does not do classification that can be handled in
  49          layering other disciplines.  It does not need to do bandwidth
  50          control either since that can be handled by using token
  51          bucket or other rate control.
  52
  53      Correlated Loss Generator models
  54
  55         Added generation of correlated loss according to the
  56         "Gilbert-Elliot" model, a 4-state markov model.
  57
  58         References:
  59         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  60         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  61         and intuitive loss model for packet networks and its implementation
  62         in the Netem module in the Linux kernel", available in [1]
  63
   64         Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
  65                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  66 */
  67
  68 struct netem_sched_data {
  69         struct Qdisc    *qdisc;
  70         struct qdisc_watchdog watchdog;
  71
  72         psched_tdiff_t latency;
  73         psched_tdiff_t jitter;
  74
  75         u32 loss;
  76         u32 limit;
  77         u32 counter;
  78         u32 gap;
  79         u32 duplicate;
  80         u32 reorder;
  81         u32 corrupt;
  82
  83         struct crndstate {
  84                 u32 last;
  85                 u32 rho;
  86         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
  87
  88         struct disttable {
  89                 u32  size;
  90                 s16 table[0];
  91         } *delay_dist;
  92
  93         enum  {
  94                 CLG_RANDOM,
  95                 CLG_4_STATES,
  96                 CLG_GILB_ELL,
  97         } loss_model;
  98
  99         /* Correlated Loss Generation models */
 100         struct clgstate {
 101                 /* state of the Markov chain */
 102                 u8 state;
 103
 104                 /* 4-states and Gilbert-Elliot models */
 105                 u32 a1; /* p13 for 4-states or p for GE */
 106                 u32 a2; /* p31 for 4-states or r for GE */
 107                 u32 a3; /* p32 for 4-states or h for GE */
 108                 u32 a4; /* p14 for 4-states or 1-k for GE */
 109                 u32 a5; /* p23 used only in 4-states */
 110         } clg;
 111
 112 };
 113
 114 /* Time stamp put into socket buffer control block */
 115 struct netem_skb_cb {
 116         psched_time_t   time_to_send;
 117 };
 118
 119 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 120 {
 121         qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 122         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 123 }
 124
 125 /* init_crandom - initialize correlated random number generator
 126  * Use entropy source for initial seed.
 127  */
 128 static void init_crandom(struct crndstate *state, unsigned long rho)
 129 {
 130         state->rho = rho;
 131         state->last = net_random();
 132 }
 133
 134 /* get_crandom - correlated random number generator
 135  * Next number depends on last value.
 136  * rho is scaled to avoid floating point.
 137  */
 138 static u32 get_crandom(struct crndstate *state)
 139 {
 140         u64 value, rho;
 141         unsigned long answer;
 142
 143         if (state->rho == 0)    /* no correlation */
 144                 return net_random();
 145
 146         value = net_random();
 147         rho = (u64)state->rho + 1;
 148         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 149         state->last = answer;
 150         return answer;
 151 }
 152
 153 /* loss_4state - 4-state model loss generator
 154  * Generates losses according to the 4-state Markov chain adopted in
 155  * the GI (General and Intuitive) loss model.
 156  */
 157 static bool loss_4state(struct netem_sched_data *q)
 158 {
 159         struct clgstate *clg = &q->clg;
 160         u32 rnd = net_random();
 161
 162         /*
 163          * Makes a comparison between rnd and the transition
 164          * probabilities outgoing from the current state, then decides the
 165          * next state and if the next packet has to be transmitted or lost.
 166          * The four states correspond to:
 167          *   1 => successfully transmitted packets within a gap period
 168          *   4 => isolated losses within a gap period
 169          *   3 => lost packets within a burst period
 170          *   2 => successfully transmitted packets within a burst period
 171          */
 172         switch (clg->state) {
 173         case 1:
 174                 if (rnd < clg->a4) {
 175                         clg->state = 4;
 176                         return true;
 177                 } else if (clg->a4 < rnd && rnd < clg->a1) {
 178                         clg->state = 3;
 179                         return true;
 180                 } else if (clg->a1 < rnd)
 181                         clg->state = 1;
 182
 183                 break;
 184         case 2:
 185                 if (rnd < clg->a5) {
 186                         clg->state = 3;
 187                         return true;
 188                 } else
 189                         clg->state = 2;
 190
 191                 break;
 192         case 3:
 193                 if (rnd < clg->a3)
 194                         clg->state = 2;
 195                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 196                         clg->state = 1;
 197                         return true;
 198                 } else if (clg->a2 + clg->a3 < rnd) {
 199                         clg->state = 3;
 200                         return true;
 201                 }
 202                 break;
 203         case 4:
 204                 clg->state = 1;
 205                 break;
 206         }
 207
 208         return false;
 209 }
 210
 211 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 212  * Generates losses according to the Gilbert-Elliot loss model or
 213  * its special cases  (Gilbert or Simple Gilbert)
 214  *
 215  * Makes a comparison between random number and the transition
 216  * probabilities outgoing from the current state, then decides the
 217  * next state. A second random number is extracted and the comparison
 218  * with the loss probability of the current state decides if the next
 219  * packet will be transmitted or lost.
 220  */
 221 static bool loss_gilb_ell(struct netem_sched_data *q)
 222 {
 223         struct clgstate *clg = &q->clg;
 224
 225         switch (clg->state) {
 226         case 1:
 227                 if (net_random() < clg->a1)
 228                         clg->state = 2;
 229                 if (net_random() < clg->a4)
 230                         return true;
 231         case 2:
 232                 if (net_random() < clg->a2)
 233                         clg->state = 1;
 234                 if (clg->a3 > net_random())
 235                         return true;
 236         }
 237
 238         return false;
 239 }
 240
 241 static bool loss_event(struct netem_sched_data *q)
 242 {
 243         switch (q->loss_model) {
 244         case CLG_RANDOM:
 245                 /* Random packet drop 0 => none, ~0 => all */
 246                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 247
 248         case CLG_4_STATES:
 249                 /* 4state loss model algorithm (used also for GI model)
 250                 * Extracts a value from the markov 4 state loss generator,
 251                 * if it is 1 drops a packet and if needed writes the event in
 252                 * the kernel logs
 253                 */
 254                 return loss_4state(q);
 255
 256         case CLG_GILB_ELL:
 257                 /* Gilbert-Elliot loss model algorithm
 258                 * Extracts a value from the Gilbert-Elliot loss generator,
 259                 * if it is 1 drops a packet and if needed writes the event in
 260                 * the kernel logs
 261                 */
 262                 return loss_gilb_ell(q);
 263         }
 264
 265         return false;   /* not reached */
 266 }
 267
 268
 269 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 270  * std deviation sigma.  Uses table lookup to approximate the desired
 271  * distribution, and a uniformly-distributed pseudo-random source.
 272  */
 273 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
 274                                 struct crndstate *state,
 275                                 const struct disttable *dist)
 276 {
 277         psched_tdiff_t x;
 278         long t;
 279         u32 rnd;
 280
 281         if (sigma == 0)
 282                 return mu;
 283
 284         rnd = get_crandom(state);
 285
 286         /* default uniform distribution */
 287         if (dist == NULL)
 288                 return (rnd % (2*sigma)) - sigma + mu;
 289
 290         t = dist->table[rnd % dist->size];
 291         x = (sigma % NETEM_DIST_SCALE) * t;
 292         if (x >= 0)
 293                 x += NETEM_DIST_SCALE/2;
 294         else
 295                 x -= NETEM_DIST_SCALE/2;
 296
 297         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 298 }
 299
 300 /*
 301  * Insert one skb into qdisc.
 302  * Note: parent depends on return value to account for queue length.
 303  *      NET_XMIT_DROP: queue length didn't change.
 304  *      NET_XMIT_SUCCESS: one skb was queued.
 305  */
 306 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 307 {
 308         struct netem_sched_data *q = qdisc_priv(sch);
 309         /* We don't fill cb now as skb_unshare() may invalidate it */
 310         struct netem_skb_cb *cb;
 311         struct sk_buff *skb2;
 312         int ret;
 313         int count = 1;
 314
 315         /* Random duplication */
 316         if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 317                 ++count;
 318
 319         /* Drop packet? */
 320         if (loss_event(q))
 321                 --count;
 322
 323         if (count == 0) {
 324                 sch->qstats.drops++;
 325                 kfree_skb(skb);
 326                 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 327         }
 328
 329         skb_orphan(skb);
 330
 331         /*
 332          * If we need to duplicate packet, then re-insert at top of the
 333          * qdisc tree, since parent queuer expects that only one
 334          * skb will be queued.
 335          */
 336         if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 337                 struct Qdisc *rootq = qdisc_root(sch);
 338                 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 339                 q->duplicate = 0;
 340
 341                 qdisc_enqueue_root(skb2, rootq);
 342                 q->duplicate = dupsave;
 343         }
 344
 345         /*
 346          * Randomized packet corruption.
 347          * Make copy if needed since we are modifying
 348          * If packet is going to be hardware checksummed, then
 349          * do it now in software before we mangle it.
 350          */
 351         if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 352                 if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 353                     (skb->ip_summed == CHECKSUM_PARTIAL &&
 354                      skb_checksum_help(skb)))
 355                         return qdisc_drop(skb, sch);
 356
 357                 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
 358         }
 359
 360         cb = netem_skb_cb(skb);
 361         if (q->gap == 0 ||              /* not doing reordering */
 362             q->counter < q->gap ||      /* inside last reordering gap */
 363             q->reorder < get_crandom(&q->reorder_cor)) {
 364                 psched_time_t now;
 365                 psched_tdiff_t delay;
 366
 367                 delay = tabledist(q->latency, q->jitter,
 368                                   &q->delay_cor, q->delay_dist);
 369
 370                 now = psched_get_time();
 371                 cb->time_to_send = now + delay;
 372                 ++q->counter;
 373                 ret = qdisc_enqueue(skb, q->qdisc);
 374         } else {
 375                 /*
 376                  * Do re-ordering by putting one out of N packets at the front
 377                  * of the queue.
 378                  */
 379                 cb->time_to_send = psched_get_time();
 380                 q->counter = 0;
 381
 382                 __skb_queue_head(&q->qdisc->q, skb);
 383                 sch->qstats.backlog += qdisc_pkt_len(skb);
 384                 sch->qstats.requeues++;
 385                 ret = NET_XMIT_SUCCESS;
 386         }
 387
 388         if (ret != NET_XMIT_SUCCESS) {
 389                 if (net_xmit_drop_count(ret)) {
 390                         sch->qstats.drops++;
 391                         return ret;
 392                 }
 393         }
 394
 395         sch->q.qlen++;
 396         return NET_XMIT_SUCCESS;
 397 }
 398
 399 static unsigned int netem_drop(struct Qdisc *sch)
 400 {
 401         struct netem_sched_data *q = qdisc_priv(sch);
 402         unsigned int len = 0;
 403
 404         if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
 405                 sch->q.qlen--;
 406                 sch->qstats.drops++;
 407         }
 408         return len;
 409 }
 410
 411 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 412 {
 413         struct netem_sched_data *q = qdisc_priv(sch);
 414         struct sk_buff *skb;
 415
 416         if (qdisc_is_throttled(sch))
 417                 return NULL;
 418
 419         skb = q->qdisc->ops->peek(q->qdisc);
 420         if (skb) {
 421                 const struct netem_skb_cb *cb = netem_skb_cb(skb);
 422                 psched_time_t now = psched_get_time();
 423
 424                 /* if more time remaining? */
 425                 if (cb->time_to_send <= now) {
 426                         skb = qdisc_dequeue_peeked(q->qdisc);
 427                         if (unlikely(!skb))
 428                                 return NULL;
 429
 430 #ifdef CONFIG_NET_CLS_ACT
 431                         /*
 432                          * If it's at ingress let's pretend the delay is
 433                          * from the network (tstamp will be updated).
 434                          */
 435                         if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
 436                                 skb->tstamp.tv64 = 0;
 437 #endif
 438
 439                         sch->q.qlen--;
 440                         qdisc_unthrottled(sch);
 441                         qdisc_bstats_update(sch, skb);
 442                         return skb;
 443                 }
 444
 445                 qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
 446         }
 447
 448         return NULL;
 449 }
 450
 451 static void netem_reset(struct Qdisc *sch)
 452 {
 453         struct netem_sched_data *q = qdisc_priv(sch);
 454
 455         qdisc_reset(q->qdisc);
 456         sch->q.qlen = 0;
 457         qdisc_watchdog_cancel(&q->watchdog);
 458 }
 459
 460 static void dist_free(struct disttable *d)
 461 {
 462         if (d) {
 463                 if (is_vmalloc_addr(d))
 464                         vfree(d);
 465                 else
 466                         kfree(d);
 467         }
 468 }
 469
 470 /*
 471  * Distribution data is a variable size payload containing
 472  * signed 16 bit values.
 473  */
 474 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 475 {
 476         struct netem_sched_data *q = qdisc_priv(sch);
 477         size_t n = nla_len(attr)/sizeof(__s16);
 478         const __s16 *data = nla_data(attr);
 479         spinlock_t *root_lock;
 480         struct disttable *d;
 481         int i;
 482         size_t s;
 483
 484         if (n > NETEM_DIST_MAX)
 485                 return -EINVAL;
 486
 487         s = sizeof(struct disttable) + n * sizeof(s16);
 488         d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
 489         if (!d)
 490                 d = vmalloc(s);
 491         if (!d)
 492                 return -ENOMEM;
 493
 494         d->size = n;
 495         for (i = 0; i < n; i++)
 496                 d->table[i] = data[i];
 497
 498         root_lock = qdisc_root_sleeping_lock(sch);
 499
 500         spin_lock_bh(root_lock);
 501         swap(q->delay_dist, d);
 502         spin_unlock_bh(root_lock);
 503
 504         dist_free(d);
 505         return 0;
 506 }
 507
 508 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
 509 {
 510         struct netem_sched_data *q = qdisc_priv(sch);
 511         const struct tc_netem_corr *c = nla_data(attr);
 512
 513         init_crandom(&q->delay_cor, c->delay_corr);
 514         init_crandom(&q->loss_cor, c->loss_corr);
 515         init_crandom(&q->dup_cor, c->dup_corr);
 516 }
 517
 518 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
 519 {
 520         struct netem_sched_data *q = qdisc_priv(sch);
 521         const struct tc_netem_reorder *r = nla_data(attr);
 522
 523         q->reorder = r->probability;
 524         init_crandom(&q->reorder_cor, r->correlation);
 525 }
 526
 527 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
 528 {
 529         struct netem_sched_data *q = qdisc_priv(sch);
 530         const struct tc_netem_corrupt *r = nla_data(attr);
 531
 532         q->corrupt = r->probability;
 533         init_crandom(&q->corrupt_cor, r->correlation);
 534 }
 535
 536 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 537 {
 538         struct netem_sched_data *q = qdisc_priv(sch);
 539         const struct nlattr *la;
 540         int rem;
 541
 542         nla_for_each_nested(la, attr, rem) {
 543                 u16 type = nla_type(la);
 544
 545                 switch(type) {
 546                 case NETEM_LOSS_GI: {
 547                         const struct tc_netem_gimodel *gi = nla_data(la);
 548
 549                         if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
 550                                 pr_info("netem: incorrect gi model size\n");
 551                                 return -EINVAL;
 552                         }
 553
 554                         q->loss_model = CLG_4_STATES;
 555
 556                         q->clg.state = 1;
 557                         q->clg.a1 = gi->p13;
 558                         q->clg.a2 = gi->p31;
 559                         q->clg.a3 = gi->p32;
 560                         q->clg.a4 = gi->p14;
 561                         q->clg.a5 = gi->p23;
 562                         break;
 563                 }
 564
 565                 case NETEM_LOSS_GE: {
 566                         const struct tc_netem_gemodel *ge = nla_data(la);
 567
 568                         if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
 569                                 pr_info("netem: incorrect gi model size\n");
 570                                 return -EINVAL;
 571                         }
 572
 573                         q->loss_model = CLG_GILB_ELL;
 574                         q->clg.state = 1;
 575                         q->clg.a1 = ge->p;
 576                         q->clg.a2 = ge->r;
 577                         q->clg.a3 = ge->h;
 578                         q->clg.a4 = ge->k1;
 579                         break;
 580                 }
 581
 582                 default:
 583                         pr_info("netem: unknown loss type %u\n", type);
 584                         return -EINVAL;
 585                 }
 586         }
 587
 588         return 0;
 589 }
 590
 591 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 592         [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 593         [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 594         [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 595         [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 596 };
 597
 598 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 599                       const struct nla_policy *policy, int len)
 600 {
 601         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 602
 603         if (nested_len < 0) {
 604                 pr_info("netem: invalid attributes len %d\n", nested_len);
 605                 return -EINVAL;
 606         }
 607
 608         if (nested_len >= nla_attr_size(0))
 609                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 610                                  nested_len, policy);
 611
 612         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 613         return 0;
 614 }
 615
 616 /* Parse netlink message to set options */
 617 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 618 {
 619         struct netem_sched_data *q = qdisc_priv(sch);
 620         struct nlattr *tb[TCA_NETEM_MAX + 1];
 621         struct tc_netem_qopt *qopt;
 622         int ret;
 623
 624         if (opt == NULL)
 625                 return -EINVAL;
 626
 627         qopt = nla_data(opt);
 628         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 629         if (ret < 0)
 630                 return ret;
 631
 632         ret = fifo_set_limit(q->qdisc, qopt->limit);
 633         if (ret) {
 634                 pr_info("netem: can't set fifo limit\n");
 635                 return ret;
 636         }
 637
 638         q->latency = qopt->latency;
 639         q->jitter = qopt->jitter;
 640         q->limit = qopt->limit;
 641         q->gap = qopt->gap;
 642         q->counter = 0;
 643         q->loss = qopt->loss;
 644         q->duplicate = qopt->duplicate;
 645
 646         /* for compatibility with earlier versions.
 647          * if gap is set, need to assume 100% probability
 648          */
 649         if (q->gap)
 650                 q->reorder = ~0;
 651
 652         if (tb[TCA_NETEM_CORR])
 653                 get_correlation(sch, tb[TCA_NETEM_CORR]);
 654
 655         if (tb[TCA_NETEM_DELAY_DIST]) {
 656                 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
 657                 if (ret)
 658                         return ret;
 659         }
 660
 661         if (tb[TCA_NETEM_REORDER])
 662                 get_reorder(sch, tb[TCA_NETEM_REORDER]);
 663
 664         if (tb[TCA_NETEM_CORRUPT])
 665                 get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
 666
 667         q->loss_model = CLG_RANDOM;
 668         if (tb[TCA_NETEM_LOSS])
 669                 ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
 670
 671         return ret;
 672 }
 673
 674 /*
 675  * Special case version of FIFO queue for use by netem.
 676  * It queues in order based on timestamps in skb's
 677  */
 678 struct fifo_sched_data {
 679         u32 limit;
 680         psched_time_t oldest;
 681 };
 682
 683 static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 684 {
 685         struct fifo_sched_data *q = qdisc_priv(sch);
 686         struct sk_buff_head *list = &sch->q;
 687         psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
 688         struct sk_buff *skb;
 689
 690         if (likely(skb_queue_len(list) < q->limit)) {
 691                 /* Optimize for add at tail */
 692                 if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
 693                         q->oldest = tnext;
 694                         return qdisc_enqueue_tail(nskb, sch);
 695                 }
 696
 697                 skb_queue_reverse_walk(list, skb) {
 698                         const struct netem_skb_cb *cb = netem_skb_cb(skb);
 699
 700                         if (tnext >= cb->time_to_send)
 701                                 break;
 702                 }
 703
 704                 __skb_queue_after(list, skb, nskb);
 705
 706                 sch->qstats.backlog += qdisc_pkt_len(nskb);
 707
 708                 return NET_XMIT_SUCCESS;
 709         }
 710
 711         return qdisc_reshape_fail(nskb, sch);
 712 }
 713
 714 static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
 715 {
 716         struct fifo_sched_data *q = qdisc_priv(sch);
 717
 718         if (opt) {
 719                 struct tc_fifo_qopt *ctl = nla_data(opt);
 720                 if (nla_len(opt) < sizeof(*ctl))
 721                         return -EINVAL;
 722
 723                 q->limit = ctl->limit;
 724         } else
 725                 q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
 726
 727         q->oldest = PSCHED_PASTPERFECT;
 728         return 0;
 729 }
 730
 731 static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
 732 {
 733         struct fifo_sched_data *q = qdisc_priv(sch);
 734         struct tc_fifo_qopt opt = { .limit = q->limit };
 735
 736         NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 737         return skb->len;
 738
 739 nla_put_failure:
 740         return -1;
 741 }
 742
 743 static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
 744         .id             =       "tfifo",
 745         .priv_size      =       sizeof(struct fifo_sched_data),
 746         .enqueue        =       tfifo_enqueue,
 747         .dequeue        =       qdisc_dequeue_head,
 748         .peek           =       qdisc_peek_head,
 749         .drop           =       qdisc_queue_drop,
 750         .init           =       tfifo_init,
 751         .reset          =       qdisc_reset_queue,
 752         .change         =       tfifo_init,
 753         .dump           =       tfifo_dump,
 754 };
 755
 756 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 757 {
 758         struct netem_sched_data *q = qdisc_priv(sch);
 759         int ret;
 760
 761         qdisc_watchdog_init(&q->watchdog, sch);
 762
 763         if (!opt)
 764                 return -EINVAL;
 765
 766         q->loss_model = CLG_RANDOM;
 767         q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
 768                                      TC_H_MAKE(sch->handle, 1));
 769         if (!q->qdisc) {
 770                 pr_notice("netem: qdisc create tfifo qdisc failed\n");
 771                 return -ENOMEM;
 772         }
 773
 774         ret = netem_change(sch, opt);
 775         if (ret) {
 776                 pr_info("netem: change failed\n");
 777                 qdisc_destroy(q->qdisc);
 778         }
 779         return ret;
 780 }
 781
 782 static void netem_destroy(struct Qdisc *sch)
 783 {
 784         struct netem_sched_data *q = qdisc_priv(sch);
 785
 786         qdisc_watchdog_cancel(&q->watchdog);
 787         qdisc_destroy(q->qdisc);
 788         dist_free(q->delay_dist);
 789 }
 790
 791 static int dump_loss_model(const struct netem_sched_data *q,
 792                            struct sk_buff *skb)
 793 {
 794         struct nlattr *nest;
 795
 796         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
 797         if (nest == NULL)
 798                 goto nla_put_failure;
 799
 800         switch (q->loss_model) {
 801         case CLG_RANDOM:
 802                 /* legacy loss model */
 803                 nla_nest_cancel(skb, nest);
 804                 return 0;       /* no data */
 805
 806         case CLG_4_STATES: {
 807                 struct tc_netem_gimodel gi = {
 808                         .p13 = q->clg.a1,
 809                         .p31 = q->clg.a2,
 810                         .p32 = q->clg.a3,
 811                         .p14 = q->clg.a4,
 812                         .p23 = q->clg.a5,
 813                 };
 814
 815                 NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
 816                 break;
 817         }
 818         case CLG_GILB_ELL: {
 819                 struct tc_netem_gemodel ge = {
 820                         .p = q->clg.a1,
 821                         .r = q->clg.a2,
 822                         .h = q->clg.a3,
 823                         .k1 = q->clg.a4,
 824                 };
 825
 826                 NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
 827                 break;
 828         }
 829         }
 830
 831         nla_nest_end(skb, nest);
 832         return 0;
 833
 834 nla_put_failure:
 835         nla_nest_cancel(skb, nest);
 836         return -1;
 837 }
 838
 839 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 840 {
 841         const struct netem_sched_data *q = qdisc_priv(sch);
 842         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
 843         struct tc_netem_qopt qopt;
 844         struct tc_netem_corr cor;
 845         struct tc_netem_reorder reorder;
 846         struct tc_netem_corrupt corrupt;
 847
 848         qopt.latency = q->latency;
 849         qopt.jitter = q->jitter;
 850         qopt.limit = q->limit;
 851         qopt.loss = q->loss;
 852         qopt.gap = q->gap;
 853         qopt.duplicate = q->duplicate;
 854         NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
 855
 856         cor.delay_corr = q->delay_cor.rho;
 857         cor.loss_corr = q->loss_cor.rho;
 858         cor.dup_corr = q->dup_cor.rho;
 859         NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
 860
 861         reorder.probability = q->reorder;
 862         reorder.correlation = q->reorder_cor.rho;
 863         NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
 864
 865         corrupt.probability = q->corrupt;
 866         corrupt.correlation = q->corrupt_cor.rho;
 867         NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
 868
 869         if (dump_loss_model(q, skb) != 0)
 870                 goto nla_put_failure;
 871
 872         return nla_nest_end(skb, nla);
 873
 874 nla_put_failure:
 875         nlmsg_trim(skb, nla);
 876         return -1;
 877 }
 878
 879 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
 880                           struct sk_buff *skb, struct tcmsg *tcm)
 881 {
 882         struct netem_sched_data *q = qdisc_priv(sch);
 883
 884         if (cl != 1)    /* only one class */
 885                 return -ENOENT;
 886
 887         tcm->tcm_handle |= TC_H_MIN(1);
 888         tcm->tcm_info = q->qdisc->handle;
 889
 890         return 0;
 891 }
 892
 893 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 894                      struct Qdisc **old)
 895 {
 896         struct netem_sched_data *q = qdisc_priv(sch);
 897
 898         if (new == NULL)
 899                 new = &noop_qdisc;
 900
 901         sch_tree_lock(sch);
 902         *old = q->qdisc;
 903         q->qdisc = new;
 904         qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
 905         qdisc_reset(*old);
 906         sch_tree_unlock(sch);
 907
 908         return 0;
 909 }
 910
 911 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
 912 {
 913         struct netem_sched_data *q = qdisc_priv(sch);
 914         return q->qdisc;
 915 }
 916
 917 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
 918 {
 919         return 1;
 920 }
 921
 922 static void netem_put(struct Qdisc *sch, unsigned long arg)
 923 {
 924 }
 925
 926 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 927 {
 928         if (!walker->stop) {
 929                 if (walker->count >= walker->skip)
 930                         if (walker->fn(sch, 1, walker) < 0) {
 931                                 walker->stop = 1;
 932                                 return;
 933                         }
 934                 walker->count++;
 935         }
 936 }
 937
 938 static const struct Qdisc_class_ops netem_class_ops = {
 939         .graft          =       netem_graft,
 940         .leaf           =       netem_leaf,
 941         .get            =       netem_get,
 942         .put            =       netem_put,
 943         .walk           =       netem_walk,
 944         .dump           =       netem_dump_class,
 945 };
 946
 947 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
 948         .id             =       "netem",
 949         .cl_ops         =       &netem_class_ops,
 950         .priv_size      =       sizeof(struct netem_sched_data),
 951         .enqueue        =       netem_enqueue,
 952         .dequeue        =       netem_dequeue,
 953         .peek           =       qdisc_peek_dequeued,
 954         .drop           =       netem_drop,
 955         .init           =       netem_init,
 956         .reset          =       netem_reset,
 957         .destroy        =       netem_destroy,
 958         .change         =       netem_change,
 959         .dump           =       netem_dump,
 960         .owner          =       THIS_MODULE,
 961 };
 962
 963
 964 static int __init netem_module_init(void)
 965 {
 966         pr_info("netem: version " VERSION "\n");
 967         return register_qdisc(&netem_qdisc_ops);
 968 }
 969 static void __exit netem_module_exit(void)
 970 {
 971         unregister_qdisc(&netem_qdisc_ops);
 972 }
 973 module_init(netem_module_init)
 974 module_exit(netem_module_exit)
 975 MODULE_LICENSE("GPL");