mm/slub.c

   1 /*
   2  * SLUB: A slab allocator that limits cache line use instead of queuing
   3  * objects in per cpu and per node lists.
   4  *
   5  * The allocator synchronizes using per slab locks and only
   6  * uses a centralized lock to manage a pool of partial slabs.
   7  *
   8  * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
   9  */
  10
  11 #include <linux/mm.h>
  12 #include <linux/module.h>
  13 #include <linux/bit_spinlock.h>
  14 #include <linux/interrupt.h>
  15 #include <linux/bitops.h>
  16 #include <linux/slab.h>
  17 #include <linux/seq_file.h>
  18 #include <linux/cpu.h>
  19 #include <linux/cpuset.h>
  20 #include <linux/mempolicy.h>
  21 #include <linux/ctype.h>
  22 #include <linux/kallsyms.h>
  23
  24 /*
  25  * Lock order:
  26  *   1. slab_lock(page)
  27  *   2. slab->list_lock
  28  *
  29  *   The slab_lock protects operations on the object of a particular
  30  *   slab and its metadata in the page struct. If the slab lock
  31  *   has been taken then no allocations nor frees can be performed
  32  *   on the objects in the slab nor can the slab be added or removed
  33  *   from the partial or full lists since this would mean modifying
  34  *   the page_struct of the slab.
  35  *
  36  *   The list_lock protects the partial and full list on each node and
  37  *   the partial slab counter. If taken then no new slabs may be added or
  38  *   removed from the lists nor make the number of partial slabs be modified.
  39  *   (Note that the total number of slabs is an atomic value that may be
  40  *   modified without taking the list lock).
  41  *
  42  *   The list_lock is a centralized lock and thus we avoid taking it as
  43  *   much as possible. As long as SLUB does not have to handle partial
  44  *   slabs, operations can continue without any centralized lock. F.e.
  45  *   allocating a long series of objects that fill up slabs does not require
  46  *   the list lock.
  47  *
  48  *   The lock order is sometimes inverted when we are trying to get a slab
  49  *   off a list. We take the list_lock and then look for a page on the list
  50  *   to use. While we do that objects in the slabs may be freed. We can
  51  *   only operate on the slab if we have also taken the slab_lock. So we use
  52  *   a slab_trylock() on the slab. If trylock was successful then no frees
  53  *   can occur anymore and we can use the slab for allocations etc. If the
  54  *   slab_trylock() does not succeed then frees are in progress in the slab and
  55  *   we must stay away from it for a while since we may cause a bouncing
  56  *   cacheline if we try to acquire the lock. So go onto the next slab.
  57  *   If all pages are busy then we may allocate a new slab instead of reusing
  58  *   a partial slab. A new slab has no one operating on it and thus there is
  59  *   no danger of cacheline contention.
  60  *
  61  *   Interrupts are disabled during allocation and deallocation in order to
  62  *   make the slab allocator safe to use in the context of an irq. In addition
  63  *   interrupts are disabled to ensure that the processor does not change
  64  *   while handling per_cpu slabs, due to kernel preemption.
  65  *
  66  * SLUB assigns one slab for allocation to each processor.
  67  * Allocations only occur from these slabs called cpu slabs.
  68  *
  69  * Slabs with free elements are kept on a partial list and during regular
  70  * operations no list for full slabs is used. If an object in a full slab is
  71  * freed then the slab will show up again on the partial lists.
  72  * We track full slabs for debugging purposes though because otherwise we
  73  * cannot scan all objects.
  74  *
  75  * Slabs are freed when they become empty. Teardown and setup is
  76  * minimal so we rely on the page allocators per cpu caches for
  77  * fast frees and allocs.
  78  *
  79  * Overloading of page flags that are otherwise used for LRU management.
  80  *
  81  * PageActive           The slab is used as a cpu cache. Allocations
  82  *                      may be performed from the slab. The slab is not
  83  *                      on any slab list and cannot be moved onto one.
  84  *
  85  * PageError            Slab requires special handling due to debug
  86  *                      options set. This moves slab handling out of
  87  *                      the fast path.
  88  */
  89
  90 /*
  91  * Issues still to be resolved:
  92  *
  93  * - The per cpu array is updated for each new slab and is a remote
  94  *   cacheline for most nodes. This could become a bouncing cacheline given
  95  *   enough frequent updates. There are 16 pointers in a cacheline, so at
  96  *   max 16 cpus could compete for the cacheline which may be okay.
  97  *
  98  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
  99  *
 100  * - Variable sizing of the per node arrays
 101  */
 102
 103 /* Enable to test recovery from slab corruption on boot */
 104 #undef SLUB_RESILIENCY_TEST
 105
 106 #if PAGE_SHIFT <= 12
 107
 108 /*
 109  * Small page size. Make sure that we do not fragment memory
 110  */
 111 #define DEFAULT_MAX_ORDER 1
 112 #define DEFAULT_MIN_OBJECTS 4
 113
 114 #else
 115
 116 /*
 117  * Large page machines are customarily able to handle larger
 118  * page orders.
 119  */
 120 #define DEFAULT_MAX_ORDER 2
 121 #define DEFAULT_MIN_OBJECTS 8
 122
 123 #endif
 124
 125 /*
 126  * Minimum number of partial slabs. These will be left on the partial
 127  * lists even if they are empty. kmem_cache_shrink may reclaim them.
 128  */
 129 #define MIN_PARTIAL 2
 130
 131 /*
 132  * Maximum number of desirable partial slabs.
 133  * The existence of more partial slabs makes kmem_cache_shrink
 134  * sort the partial list by the number of objects in use.
 135  */
 136 #define MAX_PARTIAL 10
 137
 138 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 139                                 SLAB_POISON | SLAB_STORE_USER)
 140
 141 /*
 142  * Set of flags that will prevent slab merging
 143  */
 144 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 145                 SLAB_TRACE | SLAB_DESTROY_BY_RCU)
 146
 147 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 148                 SLAB_CACHE_DMA)
 149
 150 #ifndef ARCH_KMALLOC_MINALIGN
 151 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 152 #endif
 153
 154 #ifndef ARCH_SLAB_MINALIGN
 155 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
 156 #endif
 157
 158 /* Internal SLUB flags */
 159 #define __OBJECT_POISON 0x80000000      /* Poison object */
 160
 161 /* Not all arches define cache_line_size */
 162 #ifndef cache_line_size
 163 #define cache_line_size()       L1_CACHE_BYTES
 164 #endif
 165
 166 static int kmem_size = sizeof(struct kmem_cache);
 167
 168 #ifdef CONFIG_SMP
 169 static struct notifier_block slab_notifier;
 170 #endif
 171
 172 static enum {
 173         DOWN,           /* No slab functionality available */
 174         PARTIAL,        /* kmem_cache_open() works but kmalloc does not */
 175         UP,             /* Everything works but does not show up in sysfs */
 176         SYSFS           /* Sysfs up */
 177 } slab_state = DOWN;
 178
 179 /* A list of all slab caches on the system */
 180 static DECLARE_RWSEM(slub_lock);
 181 LIST_HEAD(slab_caches);
 182
 183 #ifdef CONFIG_SYSFS
 184 static int sysfs_slab_add(struct kmem_cache *);
 185 static int sysfs_slab_alias(struct kmem_cache *, const char *);
 186 static void sysfs_slab_remove(struct kmem_cache *);
 187 #else
 188 static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 189 static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; }
 190 static void sysfs_slab_remove(struct kmem_cache *s) {}
 191 #endif
 192
 193 /********************************************************************
 194  *                      Core slab cache functions
 195  *******************************************************************/
 196
 197 int slab_is_available(void)
 198 {
 199         return slab_state >= UP;
 200 }
 201
 202 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 203 {
 204 #ifdef CONFIG_NUMA
 205         return s->node[node];
 206 #else
 207         return &s->local_node;
 208 #endif
 209 }
 210
 211 /*
 212  * Object debugging
 213  */
 214 static void print_section(char *text, u8 *addr, unsigned int length)
 215 {
 216         int i, offset;
 217         int newline = 1;
 218         char ascii[17];
 219
 220         ascii[16] = 0;
 221
 222         for (i = 0; i < length; i++) {
 223                 if (newline) {
 224                         printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
 225                         newline = 0;
 226                 }
 227                 printk(" %02x", addr[i]);
 228                 offset = i % 16;
 229                 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
 230                 if (offset == 15) {
 231                         printk(" %s\n",ascii);
 232                         newline = 1;
 233                 }
 234         }
 235         if (!newline) {
 236                 i %= 16;
 237                 while (i < 16) {
 238                         printk("   ");
 239                         ascii[i] = ' ';
 240                         i++;
 241                 }
 242                 printk(" %s\n", ascii);
 243         }
 244 }
 245
 246 /*
 247  * Slow version of get and set free pointer.
 248  *
 249  * This version requires touching the cache lines of kmem_cache which
 250  * we avoid to do in the fast alloc free paths. There we obtain the offset
 251  * from the page struct.
 252  */
 253 static void *get_freepointer(struct kmem_cache *s, void *object)
 254 {
 255         return *(void **)(object + s->offset);
 256 }
 257
 258 static void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 259 {
 260         *(void **)(object + s->offset) = fp;
 261 }
 262
 263 /*
 264  * Tracking user of a slab.
 265  */
 266 struct track {
 267         void *addr;             /* Called from address */
 268         int cpu;                /* Was running on cpu */
 269         int pid;                /* Pid context */
 270         unsigned long when;     /* When did the operation occur */
 271 };
 272
 273 enum track_item { TRACK_ALLOC, TRACK_FREE };
 274
 275 static struct track *get_track(struct kmem_cache *s, void *object,
 276         enum track_item alloc)
 277 {
 278         struct track *p;
 279
 280         if (s->offset)
 281                 p = object + s->offset + sizeof(void *);
 282         else
 283                 p = object + s->inuse;
 284
 285         return p + alloc;
 286 }
 287
 288 static void set_track(struct kmem_cache *s, void *object,
 289                                 enum track_item alloc, void *addr)
 290 {
 291         struct track *p;
 292
 293         if (s->offset)
 294                 p = object + s->offset + sizeof(void *);
 295         else
 296                 p = object + s->inuse;
 297
 298         p += alloc;
 299         if (addr) {
 300                 p->addr = addr;
 301                 p->cpu = smp_processor_id();
 302                 p->pid = current ? current->pid : -1;
 303                 p->when = jiffies;
 304         } else
 305                 memset(p, 0, sizeof(struct track));
 306 }
 307
 308 static void init_tracking(struct kmem_cache *s, void *object)
 309 {
 310         if (s->flags & SLAB_STORE_USER) {
 311                 set_track(s, object, TRACK_FREE, NULL);
 312                 set_track(s, object, TRACK_ALLOC, NULL);
 313         }
 314 }
 315
 316 static void print_track(const char *s, struct track *t)
 317 {
 318         if (!t->addr)
 319                 return;
 320
 321         printk(KERN_ERR "%s: ", s);
 322         __print_symbol("%s", (unsigned long)t->addr);
 323         printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
 324 }
 325
 326 static void print_trailer(struct kmem_cache *s, u8 *p)
 327 {
 328         unsigned int off;       /* Offset of last byte */
 329
 330         if (s->flags & SLAB_RED_ZONE)
 331                 print_section("Redzone", p + s->objsize,
 332                         s->inuse - s->objsize);
 333
 334         printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
 335                         p + s->offset,
 336                         get_freepointer(s, p));
 337
 338         if (s->offset)
 339                 off = s->offset + sizeof(void *);
 340         else
 341                 off = s->inuse;
 342
 343         if (s->flags & SLAB_STORE_USER) {
 344                 print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
 345                 print_track("Last free ", get_track(s, p, TRACK_FREE));
 346                 off += 2 * sizeof(struct track);
 347         }
 348
 349         if (off != s->size)
 350                 /* Beginning of the filler is the free pointer */
 351                 print_section("Filler", p + off, s->size - off);
 352 }
 353
 354 static void object_err(struct kmem_cache *s, struct page *page,
 355                         u8 *object, char *reason)
 356 {
 357         u8 *addr = page_address(page);
 358
 359         printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
 360                         s->name, reason, object, page);
 361         printk(KERN_ERR "    offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
 362                 object - addr, page->flags, page->inuse, page->freelist);
 363         if (object > addr + 16)
 364                 print_section("Bytes b4", object - 16, 16);
 365         print_section("Object", object, min(s->objsize, 128));
 366         print_trailer(s, object);
 367         dump_stack();
 368 }
 369
 370 static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...)
 371 {
 372         va_list args;
 373         char buf[100];
 374
 375         va_start(args, reason);
 376         vsnprintf(buf, sizeof(buf), reason, args);
 377         va_end(args);
 378         printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
 379                 page);
 380         dump_stack();
 381 }
 382
 383 static void init_object(struct kmem_cache *s, void *object, int active)
 384 {
 385         u8 *p = object;
 386
 387         if (s->flags & __OBJECT_POISON) {
 388                 memset(p, POISON_FREE, s->objsize - 1);
 389                 p[s->objsize -1] = POISON_END;
 390         }
 391
 392         if (s->flags & SLAB_RED_ZONE)
 393                 memset(p + s->objsize,
 394                         active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
 395                         s->inuse - s->objsize);
 396 }
 397
 398 static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
 399 {
 400         while (bytes) {
 401                 if (*start != (u8)value)
 402                         return 0;
 403                 start++;
 404                 bytes--;
 405         }
 406         return 1;
 407 }
 408
 409 static inline int check_valid_pointer(struct kmem_cache *s,
 410                                 struct page *page, const void *object)
 411 {
 412         void *base;
 413
 414         if (!object)
 415                 return 1;
 416
 417         base = page_address(page);
 418         if (object < base || object >= base + s->objects * s->size ||
 419                 (object - base) % s->size) {
 420                 return 0;
 421         }
 422
 423         return 1;
 424 }
 425
 426 /*
 427  * Object layout:
 428  *
 429  * object address
 430  *      Bytes of the object to be managed.
 431  *      If the freepointer may overlay the object then the free
 432  *      pointer is the first word of the object.
 433  *
 434  *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 435  *      0xa5 (POISON_END)
 436  *
 437  * object + s->objsize
 438  *      Padding to reach word boundary. This is also used for Redzoning.
 439  *      Padding is extended by another word if Redzoning is enabled and
 440  *      objsize == inuse.
 441  *
 442  *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 443  *      0xcc (RED_ACTIVE) for objects in use.
 444  *
 445  * object + s->inuse
 446  *      Meta data starts here.
 447  *
 448  *      A. Free pointer (if we cannot overwrite object on free)
 449  *      B. Tracking data for SLAB_STORE_USER
 450  *      C. Padding to reach required alignment boundary or at minimum
 451  *              one word if debugging is on to be able to detect writes
 452  *              before the word boundary.
 453  *
 454  *      Padding is done using 0x5a (POISON_INUSE)
 455  *
 456  * object + s->size
 457  *      Nothing is used beyond s->size.
 458  *
 459  * If slabcaches are merged then the objsize and inuse boundaries are mostly
 460  * ignored. And therefore no slab options that rely on these boundaries
 461  * may be used with merged slabcaches.
 462  */
 463
 464 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 465                                                 void *from, void *to)
 466 {
 467         printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n",
 468                 s->name, message, data, from, to - 1);
 469         memset(from, data, to - from);
 470 }
 471
 472 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 473 {
 474         unsigned long off = s->inuse;   /* The end of info */
 475
 476         if (s->offset)
 477                 /* Freepointer is placed after the object. */
 478                 off += sizeof(void *);
 479
 480         if (s->flags & SLAB_STORE_USER)
 481                 /* We also have user information there */
 482                 off += 2 * sizeof(struct track);
 483
 484         if (s->size == off)
 485                 return 1;
 486
 487         if (check_bytes(p + off, POISON_INUSE, s->size - off))
 488                 return 1;
 489
 490         object_err(s, page, p, "Object padding check fails");
 491
 492         /*
 493          * Restore padding
 494          */
 495         restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
 496         return 0;
 497 }
 498
 499 static int slab_pad_check(struct kmem_cache *s, struct page *page)
 500 {
 501         u8 *p;
 502         int length, remainder;
 503
 504         if (!(s->flags & SLAB_POISON))
 505                 return 1;
 506
 507         p = page_address(page);
 508         length = s->objects * s->size;
 509         remainder = (PAGE_SIZE << s->order) - length;
 510         if (!remainder)
 511                 return 1;
 512
 513         if (!check_bytes(p + length, POISON_INUSE, remainder)) {
 514                 slab_err(s, page, "Padding check failed");
 515                 restore_bytes(s, "slab padding", POISON_INUSE, p + length,
 516                         p + length + remainder);
 517                 return 0;
 518         }
 519         return 1;
 520 }
 521
 522 static int check_object(struct kmem_cache *s, struct page *page,
 523                                         void *object, int active)
 524 {
 525         u8 *p = object;
 526         u8 *endobject = object + s->objsize;
 527
 528         if (s->flags & SLAB_RED_ZONE) {
 529                 unsigned int red =
 530                         active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
 531
 532                 if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
 533                         object_err(s, page, object,
 534                         active ? "Redzone Active" : "Redzone Inactive");
 535                         restore_bytes(s, "redzone", red,
 536                                 endobject, object + s->inuse);
 537                         return 0;
 538                 }
 539         } else {
 540                 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
 541                         !check_bytes(endobject, POISON_INUSE,
 542                                         s->inuse - s->objsize)) {
 543                 object_err(s, page, p, "Alignment padding check fails");
 544                 /*
 545                  * Fix it so that there will not be another report.
 546                  *
 547                  * Hmmm... We may be corrupting an object that now expects
 548                  * to be longer than allowed.
 549                  */
 550                 restore_bytes(s, "alignment padding", POISON_INUSE,
 551                         endobject, object + s->inuse);
 552                 }
 553         }
 554
 555         if (s->flags & SLAB_POISON) {
 556                 if (!active && (s->flags & __OBJECT_POISON) &&
 557                         (!check_bytes(p, POISON_FREE, s->objsize - 1) ||
 558                                 p[s->objsize - 1] != POISON_END)) {
 559
 560                         object_err(s, page, p, "Poison check failed");
 561                         restore_bytes(s, "Poison", POISON_FREE,
 562                                                 p, p + s->objsize -1);
 563                         restore_bytes(s, "Poison", POISON_END,
 564                                         p + s->objsize - 1, p + s->objsize);
 565                         return 0;
 566                 }
 567                 /*
 568                  * check_pad_bytes cleans up on its own.
 569                  */
 570                 check_pad_bytes(s, page, p);
 571         }
 572
 573         if (!s->offset && active)
 574                 /*
 575                  * Object and freepointer overlap. Cannot check
 576                  * freepointer while object is allocated.
 577                  */
 578                 return 1;
 579
 580         /* Check free pointer validity */
 581         if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
 582                 object_err(s, page, p, "Freepointer corrupt");
 583                 /*
 584                  * No choice but to zap it and thus loose the remainder
 585                  * of the free objects in this slab. May cause
 586                  * another error because the object count is now wrong.
 587                  */
 588                 set_freepointer(s, p, NULL);
 589                 return 0;
 590         }
 591         return 1;
 592 }
 593
 594 static int check_slab(struct kmem_cache *s, struct page *page)
 595 {
 596         VM_BUG_ON(!irqs_disabled());
 597
 598         if (!PageSlab(page)) {
 599                 slab_err(s, page, "Not a valid slab page flags=%lx "
 600                         "mapping=0x%p count=%d", page->flags, page->mapping,
 601                         page_count(page));
 602                 return 0;
 603         }
 604         if (page->offset * sizeof(void *) != s->offset) {
 605                 slab_err(s, page, "Corrupted offset %lu flags=0x%lx "
 606                         "mapping=0x%p count=%d",
 607                         (unsigned long)(page->offset * sizeof(void *)),
 608                         page->flags,
 609                         page->mapping,
 610                         page_count(page));
 611                 return 0;
 612         }
 613         if (page->inuse > s->objects) {
 614                 slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx "
 615                         "mapping=0x%p count=%d",
 616                         s->name, page->inuse, s->objects, page->flags,
 617                         page->mapping, page_count(page));
 618                 return 0;
 619         }
 620         /* Slab_pad_check fixes things up after itself */
 621         slab_pad_check(s, page);
 622         return 1;
 623 }
 624
 625 /*
 626  * Determine if a certain object on a page is on the freelist. Must hold the
 627  * slab lock to guarantee that the chains are in a consistent state.
 628  */
 629 static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 630 {
 631         int nr = 0;
 632         void *fp = page->freelist;
 633         void *object = NULL;
 634
 635         while (fp && nr <= s->objects) {
 636                 if (fp == search)
 637                         return 1;
 638                 if (!check_valid_pointer(s, page, fp)) {
 639                         if (object) {
 640                                 object_err(s, page, object,
 641                                         "Freechain corrupt");
 642                                 set_freepointer(s, object, NULL);
 643                                 break;
 644                         } else {
 645                                 slab_err(s, page, "Freepointer 0x%p corrupt",
 646                                                                         fp);
 647                                 page->freelist = NULL;
 648                                 page->inuse = s->objects;
 649                                 printk(KERN_ERR "@@@ SLUB %s: Freelist "
 650                                         "cleared. Slab 0x%p\n",
 651                                         s->name, page);
 652                                 return 0;
 653                         }
 654                         break;
 655                 }
 656                 object = fp;
 657                 fp = get_freepointer(s, object);
 658                 nr++;
 659         }
 660
 661         if (page->inuse != s->objects - nr) {
 662                 slab_err(s, page, "Wrong object count. Counter is %d but "
 663                         "counted were %d", s, page, page->inuse,
 664                                                         s->objects - nr);
 665                 page->inuse = s->objects - nr;
 666                 printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. "
 667                         "Slab @0x%p\n", s->name, page);
 668         }
 669         return search == NULL;
 670 }
 671
 672 /*
 673  * Tracking of fully allocated slabs for debugging purposes.
 674  */
 675 static void add_full(struct kmem_cache_node *n, struct page *page)
 676 {
 677         spin_lock(&n->list_lock);
 678         list_add(&page->lru, &n->full);
 679         spin_unlock(&n->list_lock);
 680 }
 681
 682 static void remove_full(struct kmem_cache *s, struct page *page)
 683 {
 684         struct kmem_cache_node *n;
 685
 686         if (!(s->flags & SLAB_STORE_USER))
 687                 return;
 688
 689         n = get_node(s, page_to_nid(page));
 690
 691         spin_lock(&n->list_lock);
 692         list_del(&page->lru);
 693         spin_unlock(&n->list_lock);
 694 }
 695
 696 static int alloc_object_checks(struct kmem_cache *s, struct page *page,
 697                                                         void *object)
 698 {
 699         if (!check_slab(s, page))
 700                 goto bad;
 701
 702         if (object && !on_freelist(s, page, object)) {
 703                 slab_err(s, page, "Object 0x%p already allocated", object);
 704                 goto bad;
 705         }
 706
 707         if (!check_valid_pointer(s, page, object)) {
 708                 object_err(s, page, object, "Freelist Pointer check fails");
 709                 goto bad;
 710         }
 711
 712         if (!object)
 713                 return 1;
 714
 715         if (!check_object(s, page, object, 0))
 716                 goto bad;
 717
 718         return 1;
 719 bad:
 720         if (PageSlab(page)) {
 721                 /*
 722                  * If this is a slab page then lets do the best we can
 723                  * to avoid issues in the future. Marking all objects
 724                  * as used avoids touching the remaining objects.
 725                  */
 726                 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
 727                         s->name, page);
 728                 page->inuse = s->objects;
 729                 page->freelist = NULL;
 730                 /* Fix up fields that may be corrupted */
 731                 page->offset = s->offset / sizeof(void *);
 732         }
 733         return 0;
 734 }
 735
 736 static int free_object_checks(struct kmem_cache *s, struct page *page,
 737                                                         void *object)
 738 {
 739         if (!check_slab(s, page))
 740                 goto fail;
 741
 742         if (!check_valid_pointer(s, page, object)) {
 743                 slab_err(s, page, "Invalid object pointer 0x%p", object);
 744                 goto fail;
 745         }
 746
 747         if (on_freelist(s, page, object)) {
 748                 slab_err(s, page, "Object 0x%p already free", object);
 749                 goto fail;
 750         }
 751
 752         if (!check_object(s, page, object, 1))
 753                 return 0;
 754
 755         if (unlikely(s != page->slab)) {
 756                 if (!PageSlab(page))
 757                         slab_err(s, page, "Attempt to free object(0x%p) "
 758                                 "outside of slab", object);
 759                 else
 760                 if (!page->slab) {
 761                         printk(KERN_ERR
 762                                 "SLUB <none>: no slab for object 0x%p.\n",
 763                                                 object);
 764                         dump_stack();
 765                 }
 766                 else
 767                         slab_err(s, page, "object at 0x%p belongs "
 768                                 "to slab %s", object, page->slab->name);
 769                 goto fail;
 770         }
 771         return 1;
 772 fail:
 773         printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
 774                 s->name, page, object);
 775         return 0;
 776 }
 777
 778 /*
 779  * Slab allocation and freeing
 780  */
 781 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 782 {
 783         struct page * page;
 784         int pages = 1 << s->order;
 785
 786         if (s->order)
 787                 flags |= __GFP_COMP;
 788
 789         if (s->flags & SLAB_CACHE_DMA)
 790                 flags |= SLUB_DMA;
 791
 792         if (node == -1)
 793                 page = alloc_pages(flags, s->order);
 794         else
 795                 page = alloc_pages_node(node, flags, s->order);
 796
 797         if (!page)
 798                 return NULL;
 799
 800         mod_zone_page_state(page_zone(page),
 801                 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
 802                 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 803                 pages);
 804
 805         return page;
 806 }
 807
 808 static void setup_object(struct kmem_cache *s, struct page *page,
 809                                 void *object)
 810 {
 811         if (PageError(page)) {
 812                 init_object(s, object, 0);
 813                 init_tracking(s, object);
 814         }
 815
 816         if (unlikely(s->ctor))
 817                 s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR);
 818 }
 819
 820 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 821 {
 822         struct page *page;
 823         struct kmem_cache_node *n;
 824         void *start;
 825         void *end;
 826         void *last;
 827         void *p;
 828
 829         BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
 830
 831         if (flags & __GFP_WAIT)
 832                 local_irq_enable();
 833
 834         page = allocate_slab(s, flags & GFP_LEVEL_MASK, node);
 835         if (!page)
 836                 goto out;
 837
 838         n = get_node(s, page_to_nid(page));
 839         if (n)
 840                 atomic_long_inc(&n->nr_slabs);
 841         page->offset = s->offset / sizeof(void *);
 842         page->slab = s;
 843         page->flags |= 1 << PG_slab;
 844         if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
 845                         SLAB_STORE_USER | SLAB_TRACE))
 846                 page->flags |= 1 << PG_error;
 847
 848         start = page_address(page);
 849         end = start + s->objects * s->size;
 850
 851         if (unlikely(s->flags & SLAB_POISON))
 852                 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
 853
 854         last = start;
 855         for (p = start + s->size; p < end; p += s->size) {
 856                 setup_object(s, page, last);
 857                 set_freepointer(s, last, p);
 858                 last = p;
 859         }
 860         setup_object(s, page, last);
 861         set_freepointer(s, last, NULL);
 862
 863         page->freelist = start;
 864         page->inuse = 0;
 865 out:
 866         if (flags & __GFP_WAIT)
 867                 local_irq_disable();
 868         return page;
 869 }
 870
 871 static void __free_slab(struct kmem_cache *s, struct page *page)
 872 {
 873         int pages = 1 << s->order;
 874
 875         if (unlikely(PageError(page) || s->dtor)) {
 876                 void *start = page_address(page);
 877                 void *end = start + (pages << PAGE_SHIFT);
 878                 void *p;
 879
 880                 slab_pad_check(s, page);
 881                 for (p = start; p <= end - s->size; p += s->size) {
 882                         if (s->dtor)
 883                                 s->dtor(p, s, 0);
 884                         check_object(s, page, p, 0);
 885                 }
 886         }
 887
 888         mod_zone_page_state(page_zone(page),
 889                 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
 890                 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 891                 - pages);
 892
 893         page->mapping = NULL;
 894         __free_pages(page, s->order);
 895 }
 896
 897 static void rcu_free_slab(struct rcu_head *h)
 898 {
 899         struct page *page;
 900
 901         page = container_of((struct list_head *)h, struct page, lru);
 902         __free_slab(page->slab, page);
 903 }
 904
 905 static void free_slab(struct kmem_cache *s, struct page *page)
 906 {
 907         if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
 908                 /*
 909                  * RCU free overloads the RCU head over the LRU
 910                  */
 911                 struct rcu_head *head = (void *)&page->lru;
 912
 913                 call_rcu(head, rcu_free_slab);
 914         } else
 915                 __free_slab(s, page);
 916 }
 917
 918 static void discard_slab(struct kmem_cache *s, struct page *page)
 919 {
 920         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 921
 922         atomic_long_dec(&n->nr_slabs);
 923         reset_page_mapcount(page);
 924         page->flags &= ~(1 << PG_slab | 1 << PG_error);
 925         free_slab(s, page);
 926 }
 927
 928 /*
 929  * Per slab locking using the pagelock
 930  */
 931 static __always_inline void slab_lock(struct page *page)
 932 {
 933         bit_spin_lock(PG_locked, &page->flags);
 934 }
 935
 936 static __always_inline void slab_unlock(struct page *page)
 937 {
 938         bit_spin_unlock(PG_locked, &page->flags);
 939 }
 940
 941 static __always_inline int slab_trylock(struct page *page)
 942 {
 943         int rc = 1;
 944
 945         rc = bit_spin_trylock(PG_locked, &page->flags);
 946         return rc;
 947 }
 948
 949 /*
 950  * Management of partially allocated slabs
 951  */
 952 static void add_partial_tail(struct kmem_cache_node *n, struct page *page)
 953 {
 954         spin_lock(&n->list_lock);
 955         n->nr_partial++;
 956         list_add_tail(&page->lru, &n->partial);
 957         spin_unlock(&n->list_lock);
 958 }
 959
 960 static void add_partial(struct kmem_cache_node *n, struct page *page)
 961 {
 962         spin_lock(&n->list_lock);
 963         n->nr_partial++;
 964         list_add(&page->lru, &n->partial);
 965         spin_unlock(&n->list_lock);
 966 }
 967
 968 static void remove_partial(struct kmem_cache *s,
 969                                                 struct page *page)
 970 {
 971         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 972
 973         spin_lock(&n->list_lock);
 974         list_del(&page->lru);
 975         n->nr_partial--;
 976         spin_unlock(&n->list_lock);
 977 }
 978
 979 /*
 980  * Lock slab and remove from the partial list.
 981  *
 982  * Must hold list_lock.
 983  */
 984 static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page)
 985 {
 986         if (slab_trylock(page)) {
 987                 list_del(&page->lru);
 988                 n->nr_partial--;
 989                 return 1;
 990         }
 991         return 0;
 992 }
 993
 994 /*
 995  * Try to allocate a partial slab from a specific node.
 996  */
 997 static struct page *get_partial_node(struct kmem_cache_node *n)
 998 {
 999         struct page *page;
1000
1001         /*
1002          * Racy check. If we mistakenly see no partial slabs then we
1003          * just allocate an empty slab. If we mistakenly try to get a
1004          * partial slab and there is none available then get_partials()
1005          * will return NULL.
1006          */
1007         if (!n || !n->nr_partial)
1008                 return NULL;
1009
1010         spin_lock(&n->list_lock);
1011         list_for_each_entry(page, &n->partial, lru)
1012                 if (lock_and_del_slab(n, page))
1013                         goto out;
1014         page = NULL;
1015 out:
1016         spin_unlock(&n->list_lock);
1017         return page;
1018 }
1019
1020 /*
1021  * Get a page from somewhere. Search in increasing NUMA distances.
1022  */
1023 static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1024 {
1025 #ifdef CONFIG_NUMA
1026         struct zonelist *zonelist;
1027         struct zone **z;
1028         struct page *page;
1029
1030         /*
1031          * The defrag ratio allows a configuration of the tradeoffs between
1032          * inter node defragmentation and node local allocations. A lower
1033          * defrag_ratio increases the tendency to do local allocations
1034          * instead of attempting to obtain partial slabs from other nodes.
1035          *
1036          * If the defrag_ratio is set to 0 then kmalloc() always
1037          * returns node local objects. If the ratio is higher then kmalloc()
1038          * may return off node objects because partial slabs are obtained
1039          * from other nodes and filled up.
1040          *
1041          * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
1042          * defrag_ratio = 1000) then every (well almost) allocation will
1043          * first attempt to defrag slab caches on other nodes. This means
1044          * scanning over all nodes to look for partial slabs which may be
1045          * expensive if we do it every time we are trying to find a slab
1046          * with available objects.
1047          */
1048         if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
1049                 return NULL;
1050
1051         zonelist = &NODE_DATA(slab_node(current->mempolicy))
1052                                         ->node_zonelists[gfp_zone(flags)];
1053         for (z = zonelist->zones; *z; z++) {
1054                 struct kmem_cache_node *n;
1055
1056                 n = get_node(s, zone_to_nid(*z));
1057
1058                 if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
1059                                 n->nr_partial > MIN_PARTIAL) {
1060                         page = get_partial_node(n);
1061                         if (page)
1062                                 return page;
1063                 }
1064         }
1065 #endif
1066         return NULL;
1067 }
1068
1069 /*
1070  * Get a partial page, lock it and return it.
1071  */
1072 static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1073 {
1074         struct page *page;
1075         int searchnode = (node == -1) ? numa_node_id() : node;
1076
1077         page = get_partial_node(get_node(s, searchnode));
1078         if (page || (flags & __GFP_THISNODE))
1079                 return page;
1080
1081         return get_any_partial(s, flags);
1082 }
1083
1084 /*
1085  * Move a page back to the lists.
1086  *
1087  * Must be called with the slab lock held.
1088  *
1089  * On exit the slab lock will have been dropped.
1090  */
1091 static void putback_slab(struct kmem_cache *s, struct page *page)
1092 {
1093         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1094
1095         if (page->inuse) {
1096
1097                 if (page->freelist)
1098                         add_partial(n, page);
1099                 else if (PageError(page) && (s->flags & SLAB_STORE_USER))
1100                         add_full(n, page);
1101                 slab_unlock(page);
1102
1103         } else {
1104                 if (n->nr_partial < MIN_PARTIAL) {
1105                         /*
1106                          * Adding an empty slab to the partial slabs in order
1107                          * to avoid page allocator overhead. This slab needs
1108                          * to come after the other slabs with objects in
1109                          * order to fill them up. That way the size of the
1110                          * partial list stays small. kmem_cache_shrink can
1111                          * reclaim empty slabs from the partial list.
1112                          */
1113                         add_partial_tail(n, page);
1114                         slab_unlock(page);
1115                 } else {
1116                         slab_unlock(page);
1117                         discard_slab(s, page);
1118                 }
1119         }
1120 }
1121
1122 /*
1123  * Remove the cpu slab
1124  */
1125 static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
1126 {
1127         s->cpu_slab[cpu] = NULL;
1128         ClearPageActive(page);
1129
1130         putback_slab(s, page);
1131 }
1132
/*
 * Lock the given cpu slab and deactivate it. The slab lock taken here is
 * released further down the call chain, in putback_slab().
 */
static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
{
        slab_lock(page);
        deactivate_slab(s, page, cpu);
}
1138
1139 /*
1140  * Flush cpu slab.
1141  * Called from IPI handler with interrupts disabled.
1142  */
1143 static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1144 {
1145         struct page *page = s->cpu_slab[cpu];
1146
1147         if (likely(page))
1148                 flush_slab(s, page, cpu);
1149 }
1150
/*
 * Flush the cpu slab of the executing processor. The argument is the
 * kmem_cache passed as an opaque pointer (see flush_all()).
 */
static void flush_cpu_slab(void *d)
{
        struct kmem_cache *s = d;

        __flush_cpu_slab(s, smp_processor_id());
}
1158
/*
 * Flush the cpu slabs of a cache: on every cpu for SMP builds, otherwise
 * on the single processor with interrupts disabled.
 */
static void flush_all(struct kmem_cache *s)
{
#ifdef CONFIG_SMP
        on_each_cpu(flush_cpu_slab, s, 1, 1);
#else
        unsigned long irqflags;

        local_irq_save(irqflags);
        flush_cpu_slab(s);
        local_irq_restore(irqflags);
#endif
}
1171
/*
 * slab_alloc is optimized to only modify two cachelines on the fast path
 * (aside from the stack):
 *
 * 1. The page struct
 * 2. The first cacheline of the object to be allocated.
 *
 * The only other cache lines that are read (apart from code) is the
 * per cpu array in the kmem_cache struct.
 *
 * Fastpath is not possible if we need to get a new slab or have
 * debugging enabled (which means all slabs are marked with PageError)
 *
 * s        - cache to allocate from
 * gfpflags - allocation flags, passed on to get_partial() and new_slab()
 * node     - node the object must come from, or -1 for no restriction
 * addr     - caller address, recorded when SLAB_STORE_USER is set
 *
 * Returns the allocated object, or NULL if no slab could be obtained.
 * Interrupts are disabled on entry and restored to the saved state
 * before returning.
 */
static void *slab_alloc(struct kmem_cache *s,
                                gfp_t gfpflags, int node, void *addr)
{
        struct page *page;
        void **object;
        unsigned long flags;
        int cpu;

        local_irq_save(flags);
        cpu = smp_processor_id();
        page = s->cpu_slab[cpu];
        if (!page)
                goto new_slab;

        slab_lock(page);
        /* A cpu slab on the wrong node cannot satisfy a node request. */
        if (unlikely(node != -1 && page_to_nid(page) != node))
                goto another_slab;
redo:
        /* Reached with the slab lock held on page. */
        object = page->freelist;
        if (unlikely(!object))
                goto another_slab;
        if (unlikely(PageError(page)))
                goto debug;

have_object:
        /* Pop the object: the next free object is stored inside it. */
        page->inuse++;
        page->freelist = object[page->offset];
        slab_unlock(page);
        local_irq_restore(flags);
        return object;

another_slab:
        /* Gives up the cpu slab; the slab lock is dropped in the process. */
        deactivate_slab(s, page, cpu);

new_slab:
        /* get_partial() hands back a slab that is already locked. */
        page = get_partial(s, gfpflags, node);
        if (likely(page)) {
have_slab:
                s->cpu_slab[cpu] = page;
                SetPageActive(page);
                goto redo;
        }

        /* new_slab() may enable interrupts while it allocates. */
        page = new_slab(s, gfpflags, node);
        if (page) {
                cpu = smp_processor_id();
                if (s->cpu_slab[cpu]) {
                        /*
                         * Someone else populated the cpu_slab while we
                         * enabled interrupts, or we have gotten scheduled
                         * on another cpu. The page may not be on the
                         * requested node even if __GFP_THISNODE was
                         * specified. So we need to recheck.
                         */
                        if (node == -1 ||
                                page_to_nid(s->cpu_slab[cpu]) == node) {
                                /*
                                 * Current cpuslab is acceptable and we
                                 * want the current one since its cache hot
                                 */
                                discard_slab(s, page);
                                page = s->cpu_slab[cpu];
                                slab_lock(page);
                                goto redo;
                        }
                        /*
                         * The current cpu slab is on the wrong node:
                         * flush it and install the new slab instead.
                         */
                        flush_slab(s, s->cpu_slab[cpu], cpu);
                }
                slab_lock(page);
                goto have_slab;
        }
        local_irq_restore(flags);
        return NULL;
debug:
        /* Debug slab: validate the object before handing it out. */
        if (!alloc_object_checks(s, page, object))
                goto another_slab;
        if (s->flags & SLAB_STORE_USER)
                set_track(s, object, TRACK_ALLOC, addr);
        if (s->flags & SLAB_TRACE) {
                printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
                        s->name, object, page->inuse,
                        page->freelist);
                dump_stack();
        }
        init_object(s, object, 1);
        goto have_object;
}
1272
1273 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1274 {
1275         return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
1276 }
1277 EXPORT_SYMBOL(kmem_cache_alloc);
1278
1279 #ifdef CONFIG_NUMA
1280 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1281 {
1282         return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
1283 }
1284 EXPORT_SYMBOL(kmem_cache_alloc_node);
1285 #endif
1286
/*
 * The fastpath only writes the cacheline of the page struct and the first
 * cacheline of the object.
 *
 * We read the cpu_slab cacheline to check if the slab is the per cpu
 * slab for this processor.
 *
 * s    - cache the object belongs to
 * page - slab page containing the object
 * x    - object to free
 * addr - caller address, recorded when SLAB_STORE_USER is set
 *
 * Runs with interrupts disabled and the slab lock held; both are
 * released before returning.
 */
static void slab_free(struct kmem_cache *s, struct page *page,
                                        void *x, void *addr)
{
        void *prior;
        void **object = (void *)x;
        unsigned long flags;

        local_irq_save(flags);
        slab_lock(page);

        if (unlikely(PageError(page)))
                goto debug;
checks_ok:
        /*
         * Push the object onto the slab freelist. prior remembers the
         * old list head: NULL means the slab had no free objects before.
         */
        prior = object[page->offset] = page->freelist;
        page->freelist = object;
        page->inuse--;

        if (unlikely(PageActive(page)))
                /*
                 * Cpu slabs are never on partial lists and are
                 * never freed.
                 */
                goto out_unlock;

        if (unlikely(!page->inuse))
                goto slab_empty;

        /*
         * Objects left in the slab. If it
         * was not on the partial list before
         * then add it.
         */
        if (unlikely(!prior))
                add_partial(get_node(s, page_to_nid(page)), page);

out_unlock:
        slab_unlock(page);
        local_irq_restore(flags);
        return;

slab_empty:
        /* The last object was freed: the whole slab can be discarded. */
        if (prior)
                /*
                 * Slab still on the partial list.
                 */
                remove_partial(s, page);

        slab_unlock(page);
        discard_slab(s, page);
        local_irq_restore(flags);
        return;

debug:
        /* Debug slab: if the checks fail the object is not freed. */
        if (!free_object_checks(s, page, x))
                goto out_unlock;
        /* A slab without free objects that is not a cpu slab: take it off the full list. */
        if (!PageActive(page) && !page->freelist)
                remove_full(s, page);
        if (s->flags & SLAB_STORE_USER)
                set_track(s, x, TRACK_FREE, addr);
        if (s->flags & SLAB_TRACE) {
                printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
                        s->name, object, page->inuse,
                        page->freelist);
                print_section("Object", (void *)object, s->objsize);
                dump_stack();
        }
        init_object(s, object, 0);
        goto checks_ok;
}
1363
/*
 * Free object x back to cache s. The slab is located through the head
 * page of the object's address.
 */
void kmem_cache_free(struct kmem_cache *s, void *x)
{
        struct page *page = virt_to_head_page(x);

        slab_free(s, page, x, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_free);
1373
1374 /* Figure out on which slab object the object resides */
1375 static struct page *get_object_page(const void *x)
1376 {
1377         struct page *page = virt_to_head_page(x);
1378
1379         if (!PageSlab(page))
1380                 return NULL;
1381
1382         return page;
1383 }
1384
1385 /*
1386  * Object placement in a slab is made very easy because we always start at
1387  * offset 0. If we tune the size of the object to the alignment then we can
1388  * get the required alignment by putting one properly sized object after
1389  * another.
1390  *
1391  * Notice that the allocation order determines the sizes of the per cpu
1392  * caches. Each processor has always one slab available for allocations.
1393  * Increasing the allocation order reduces the number of times that slabs
1394  * must be moved on and off the partial lists and is therefore a factor in
1395  * locking overhead.
1396  */
1397
/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 *
 * calculate_order() starts its search at slub_min_order, and only insists
 * on slub_min_objects objects per slab for orders below slub_max_order.
 */
static int slub_min_order;
static int slub_max_order = DEFAULT_MAX_ORDER;
static int slub_min_objects = DEFAULT_MIN_OBJECTS;

/*
 * Merge control. If this is set then no merging of slab caches will occur.
 * (Could be removed. This was introduced to pacify the merge skeptics.)
 */
static int slub_nomerge;

/*
 * Debug settings:
 *
 * slub_debug holds the debug flags that kmem_cache_open() ORs into the
 * flags of a new cache.
 */
static int slub_debug;

/*
 * If set, kmem_cache_open() applies slub_debug only to caches whose name
 * starts with this string.
 */
static char *slub_debug_slabs;
1420
1421 /*
1422  * Calculate the order of allocation given an slab object size.
1423  *
1424  * The order of allocation has significant impact on performance and other
1425  * system components. Generally order 0 allocations should be preferred since
1426  * order 0 does not cause fragmentation in the page allocator. Larger objects
1427  * be problematic to put into order 0 slabs because there may be too much
1428  * unused space left. We go to a higher order if more than 1/8th of the slab
1429  * would be wasted.
1430  *
1431  * In order to reach satisfactory performance we must ensure that a minimum
1432  * number of objects is in one slab. Otherwise we may generate too much
1433  * activity on the partial lists which requires taking the list_lock. This is
1434  * less a concern for large slabs though which are rarely used.
1435  *
1436  * slub_max_order specifies the order where we begin to stop considering the
1437  * number of objects in a slab as critical. If we reach slub_max_order then
1438  * we try to keep the page order as low as possible. So we accept more waste
1439  * of space in favor of a small page order.
1440  *
1441  * Higher order allocations also allow the placement of more objects in a
1442  * slab and thereby reduce object handling overhead. If the user has
1443  * requested a higher mininum order then we start with that one instead of
1444  * the smallest order which will fit the object.
1445  */
1446 static int calculate_order(int size)
1447 {
1448         int order;
1449         int rem;
1450
1451         for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
1452                         order < MAX_ORDER; order++) {
1453                 unsigned long slab_size = PAGE_SIZE << order;
1454
1455                 if (slub_max_order > order &&
1456                                 slab_size < slub_min_objects * size)
1457                         continue;
1458
1459                 if (slab_size < size)
1460                         continue;
1461
1462                 rem = slab_size % size;
1463
1464                 if (rem <= slab_size / 8)
1465                         break;
1466
1467         }
1468         if (order >= MAX_ORDER)
1469                 return -E2BIG;
1470
1471         return order;
1472 }
1473
1474 /*
1475  * Figure out what the alignment of the objects will be.
1476  */
1477 static unsigned long calculate_alignment(unsigned long flags,
1478                 unsigned long align, unsigned long size)
1479 {
1480         /*
1481          * If the user wants hardware cache aligned objects then
1482          * follow that suggestion if the object is sufficiently
1483          * large.
1484          *
1485          * The hardware cache alignment cannot override the
1486          * specified alignment though. If that is greater
1487          * then use it.
1488          */
1489         if ((flags & SLAB_HWCACHE_ALIGN) &&
1490                         size > cache_line_size() / 2)
1491                 return max_t(unsigned long, align, cache_line_size());
1492
1493         if (align < ARCH_SLAB_MINALIGN)
1494                 return ARCH_SLAB_MINALIGN;
1495
1496         return ALIGN(align, sizeof(void *));
1497 }
1498
1499 static void init_kmem_cache_node(struct kmem_cache_node *n)
1500 {
1501         n->nr_partial = 0;
1502         atomic_long_set(&n->nr_slabs, 0);
1503         spin_lock_init(&n->list_lock);
1504         INIT_LIST_HEAD(&n->partial);
1505         INIT_LIST_HEAD(&n->full);
1506 }
1507
1508 #ifdef CONFIG_NUMA
1509 /*
1510  * No kmalloc_node yet so do it by hand. We know that this is the first
1511  * slab on the node for this slabcache. There are no concurrent accesses
1512  * possible.
1513  *
1514  * Note that this function only works on the kmalloc_node_cache
1515  * when allocating for the kmalloc_node_cache.
1516  */
1517 static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags,
1518                                                                 int node)
1519 {
1520         struct page *page;
1521         struct kmem_cache_node *n;
1522
1523         BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
1524
1525         page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
1526         /* new_slab() disables interupts */
1527         local_irq_enable();
1528
1529         BUG_ON(!page);
1530         n = page->freelist;
1531         BUG_ON(!n);
1532         page->freelist = get_freepointer(kmalloc_caches, n);
1533         page->inuse++;
1534         kmalloc_caches->node[node] = n;
1535         init_object(kmalloc_caches, n, 1);
1536         init_kmem_cache_node(n);
1537         atomic_long_inc(&n->nr_slabs);
1538         add_partial(n, page);
1539         return n;
1540 }
1541
1542 static void free_kmem_cache_nodes(struct kmem_cache *s)
1543 {
1544         int node;
1545
1546         for_each_online_node(node) {
1547                 struct kmem_cache_node *n = s->node[node];
1548                 if (n && n != &s->local_node)
1549                         kmem_cache_free(kmalloc_caches, n);
1550                 s->node[node] = NULL;
1551         }
1552 }
1553
1554 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1555 {
1556         int node;
1557         int local_node;
1558
1559         if (slab_state >= UP)
1560                 local_node = page_to_nid(virt_to_page(s));
1561         else
1562                 local_node = 0;
1563
1564         for_each_online_node(node) {
1565                 struct kmem_cache_node *n;
1566
1567                 if (local_node == node)
1568                         n = &s->local_node;
1569                 else {
1570                         if (slab_state == DOWN) {
1571                                 n = early_kmem_cache_node_alloc(gfpflags,
1572                                                                 node);
1573                                 continue;
1574                         }
1575                         n = kmem_cache_alloc_node(kmalloc_caches,
1576                                                         gfpflags, node);
1577
1578                         if (!n) {
1579                                 free_kmem_cache_nodes(s);
1580                                 return 0;
1581                         }
1582
1583                 }
1584                 s->node[node] = n;
1585                 init_kmem_cache_node(n);
1586         }
1587         return 1;
1588 }
1589 #else
/*
 * Without CONFIG_NUMA the only per node structure is the local_node
 * embedded in the kmem_cache (see init_kmem_cache_nodes() below), so
 * there is nothing to release.
 */
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
}
1593
1594 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1595 {
1596         init_kmem_cache_node(&s->local_node);
1597         return 1;
1598 }
1599 #endif
1600
1601 /*
1602  * calculate_sizes() determines the order and the distribution of data within
1603  * a slab object.
1604  */
1605 static int calculate_sizes(struct kmem_cache *s)
1606 {
1607         unsigned long flags = s->flags;
1608         unsigned long size = s->objsize;
1609         unsigned long align = s->align;
1610
1611         /*
1612          * Determine if we can poison the object itself. If the user of
1613          * the slab may touch the object after free or before allocation
1614          * then we should never poison the object itself.
1615          */
1616         if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
1617                         !s->ctor && !s->dtor)
1618                 s->flags |= __OBJECT_POISON;
1619         else
1620                 s->flags &= ~__OBJECT_POISON;
1621
1622         /*
1623          * Round up object size to the next word boundary. We can only
1624          * place the free pointer at word boundaries and this determines
1625          * the possible location of the free pointer.
1626          */
1627         size = ALIGN(size, sizeof(void *));
1628
1629         /*
1630          * If we are Redzoning then check if there is some space between the
1631          * end of the object and the free pointer. If not then add an
1632          * additional word to have some bytes to store Redzone information.
1633          */
1634         if ((flags & SLAB_RED_ZONE) && size == s->objsize)
1635                 size += sizeof(void *);
1636
1637         /*
1638          * With that we have determined the number of bytes in actual use
1639          * by the object. This is the potential offset to the free pointer.
1640          */
1641         s->inuse = size;
1642
1643         if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
1644                 s->ctor || s->dtor)) {
1645                 /*
1646                  * Relocate free pointer after the object if it is not
1647                  * permitted to overwrite the first word of the object on
1648                  * kmem_cache_free.
1649                  *
1650                  * This is the case if we do RCU, have a constructor or
1651                  * destructor or are poisoning the objects.
1652                  */
1653                 s->offset = size;
1654                 size += sizeof(void *);
1655         }
1656
1657         if (flags & SLAB_STORE_USER)
1658                 /*
1659                  * Need to store information about allocs and frees after
1660                  * the object.
1661                  */
1662                 size += 2 * sizeof(struct track);
1663
1664         if (flags & SLAB_RED_ZONE)
1665                 /*
1666                  * Add some empty padding so that we can catch
1667                  * overwrites from earlier objects rather than let
1668                  * tracking information or the free pointer be
1669                  * corrupted if an user writes before the start
1670                  * of the object.
1671                  */
1672                 size += sizeof(void *);
1673
1674         /*
1675          * Determine the alignment based on various parameters that the
1676          * user specified and the dynamic determination of cache line size
1677          * on bootup.
1678          */
1679         align = calculate_alignment(flags, align, s->objsize);
1680
1681         /*
1682          * SLUB stores one object immediately after another beginning from
1683          * offset 0. In order to align the objects we have to simply size
1684          * each object to conform to the alignment.
1685          */
1686         size = ALIGN(size, align);
1687         s->size = size;
1688
1689         s->order = calculate_order(size);
1690         if (s->order < 0)
1691                 return 0;
1692
1693         /*
1694          * Determine the number of objects per slab
1695          */
1696         s->objects = (PAGE_SIZE << s->order) / size;
1697
1698         /*
1699          * Verify that the number of objects is within permitted limits.
1700          * The page->inuse field is only 16 bit wide! So we cannot have
1701          * more than 64k objects per slab.
1702          */
1703         if (!s->objects || s->objects > 65535)
1704                 return 0;
1705         return 1;
1706
1707 }
1708
1709 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
1710                 const char *name, size_t size,
1711                 size_t align, unsigned long flags,
1712                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
1713                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
1714 {
1715         memset(s, 0, kmem_size);
1716         s->name = name;
1717         s->ctor = ctor;
1718         s->dtor = dtor;
1719         s->objsize = size;
1720         s->flags = flags;
1721         s->align = align;
1722
1723         /*
1724          * The page->offset field is only 16 bit wide. This is an offset
1725          * in units of words from the beginning of an object. If the slab
1726          * size is bigger then we cannot move the free pointer behind the
1727          * object anymore.
1728          *
1729          * On 32 bit platforms the limit is 256k. On 64bit platforms
1730          * the limit is 512k.
1731          *
1732          * Debugging or ctor/dtors may create a need to move the free
1733          * pointer. Fail if this happens.
1734          */
1735         if (s->size >= 65535 * sizeof(void *)) {
1736                 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
1737                                 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
1738                 BUG_ON(ctor || dtor);
1739         }
1740         else
1741                 /*
1742                  * Enable debugging if selected on the kernel commandline.
1743                  */
1744                 if (slub_debug && (!slub_debug_slabs ||
1745                     strncmp(slub_debug_slabs, name,
1746                         strlen(slub_debug_slabs)) == 0))
1747                                 s->flags |= slub_debug;
1748
1749         if (!calculate_sizes(s))
1750                 goto error;
1751
1752         s->refcount = 1;
1753 #ifdef CONFIG_NUMA
1754         s->defrag_ratio = 100;
1755 #endif
1756
1757         if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
1758                 return 1;
1759 error:
1760         if (flags & SLAB_PANIC)
1761                 panic("Cannot create slab %s size=%lu realsize=%u "
1762                         "order=%u offset=%u flags=%lx\n",
1763                         s->name, (unsigned long)size, s->size, s->order,
1764                         s->offset, flags);
1765         return 0;
1766 }
1767 EXPORT_SYMBOL(kmem_cache_open);
1768
1769 /*
1770  * Check if a given pointer is valid
1771  */
1772 int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1773 {
1774         struct page * page;
1775
1776         page = get_object_page(object);
1777
1778         if (!page || s != page->slab)
1779                 /* No slab or wrong slab */
1780                 return 0;
1781
1782         if (!check_valid_pointer(s, page, object))
1783                 return 0;
1784
1785         /*
1786          * We could also check if the object is on the slabs freelist.
1787          * But this would be too expensive and it seems that the main
1788          * purpose of kmem_ptr_valid is to check if the object belongs
1789          * to a certain slab.
1790          */
1791         return 1;
1792 }
1793 EXPORT_SYMBOL(kmem_ptr_validate);
1794
1795 /*
1796  * Determine the size of a slab object
1797  */
1798 unsigned int kmem_cache_size(struct kmem_cache *s)
1799 {
1800         return s->objsize;
1801 }
1802 EXPORT_SYMBOL(kmem_cache_size);
1803
1804 const char *kmem_cache_name(struct kmem_cache *s)
1805 {
1806         return s->name;
1807 }
1808 EXPORT_SYMBOL(kmem_cache_name);
1809
1810 /*
1811  * Attempt to free all slabs on a node. Return the number of slabs we
1812  * were unable to free.
1813  */
1814 static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1815                         struct list_head *list)
1816 {
1817         int slabs_inuse = 0;
1818         unsigned long flags;
1819         struct page *page, *h;
1820
1821         spin_lock_irqsave(&n->list_lock, flags);
1822         list_for_each_entry_safe(page, h, list, lru)
1823                 if (!page->inuse) {
1824                         list_del(&page->lru);
1825                         discard_slab(s, page);
1826                 } else
1827                         slabs_inuse++;
1828         spin_unlock_irqrestore(&n->list_lock, flags);
1829         return slabs_inuse;
1830 }
1831
1832 /*
1833  * Release all resources used by a slab cache.
1834  */
1835 static int kmem_cache_close(struct kmem_cache *s)
1836 {
1837         int node;
1838
1839         flush_all(s);
1840
1841         /* Attempt to free all objects */
1842         for_each_online_node(node) {
1843                 struct kmem_cache_node *n = get_node(s, node);
1844
1845                 n->nr_partial -= free_list(s, n, &n->partial);
1846                 if (atomic_long_read(&n->nr_slabs))
1847                         return 1;
1848         }
1849         free_kmem_cache_nodes(s);
1850         return 0;
1851 }
1852
1853 /*
1854  * Close a cache and release the kmem_cache structure
1855  * (must be used for caches created using kmem_cache_create)
1856  */
1857 void kmem_cache_destroy(struct kmem_cache *s)
1858 {
1859         down_write(&slub_lock);
1860         s->refcount--;
1861         if (!s->refcount) {
1862                 list_del(&s->list);
1863                 if (kmem_cache_close(s))
1864                         WARN_ON(1);
1865                 sysfs_slab_remove(s);
1866                 kfree(s);
1867         }
1868         up_write(&slub_lock);
1869 }
1870 EXPORT_SYMBOL(kmem_cache_destroy);
1871
1872 /********************************************************************
1873  *              Kmalloc subsystem
1874  *******************************************************************/
1875
/*
 * The general purpose kmalloc caches. get_slab() picks the entry at the
 * index computed by kmalloc_index().
 */
struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
EXPORT_SYMBOL(kmalloc_caches);

#ifdef CONFIG_ZONE_DMA
/*
 * DMA counterparts of the kmalloc caches, using the same index. Entries
 * start out NULL and are created by get_slab() on first use.
 */
static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1];
#endif
1882
1883 static int __init setup_slub_min_order(char *str)
1884 {
1885         get_option (&str, &slub_min_order);
1886
1887         return 1;
1888 }
1889
1890 __setup("slub_min_order=", setup_slub_min_order);
1891
1892 static int __init setup_slub_max_order(char *str)
1893 {
1894         get_option (&str, &slub_max_order);
1895
1896         return 1;
1897 }
1898
1899 __setup("slub_max_order=", setup_slub_max_order);
1900
1901 static int __init setup_slub_min_objects(char *str)
1902 {
1903         get_option (&str, &slub_min_objects);
1904
1905         return 1;
1906 }
1907
1908 __setup("slub_min_objects=", setup_slub_min_objects);
1909
1910 static int __init setup_slub_nomerge(char *str)
1911 {
1912         slub_nomerge = 1;
1913         return 1;
1914 }
1915
1916 __setup("slub_nomerge", setup_slub_nomerge);
1917
1918 static int __init setup_slub_debug(char *str)
1919 {
1920         if (!str || *str != '=')
1921                 slub_debug = DEBUG_DEFAULT_FLAGS;
1922         else {
1923                 str++;
1924                 if (*str == 0 || *str == ',')
1925                         slub_debug = DEBUG_DEFAULT_FLAGS;
1926                 else
1927                 for( ;*str && *str != ','; str++)
1928                         switch (*str) {
1929                         case 'f' : case 'F' :
1930                                 slub_debug |= SLAB_DEBUG_FREE;
1931                                 break;
1932                         case 'z' : case 'Z' :
1933                                 slub_debug |= SLAB_RED_ZONE;
1934                                 break;
1935                         case 'p' : case 'P' :
1936                                 slub_debug |= SLAB_POISON;
1937                                 break;
1938                         case 'u' : case 'U' :
1939                                 slub_debug |= SLAB_STORE_USER;
1940                                 break;
1941                         case 't' : case 'T' :
1942                                 slub_debug |= SLAB_TRACE;
1943                                 break;
1944                         default:
1945                                 printk(KERN_ERR "slub_debug option '%c' "
1946                                         "unknown. skipped\n",*str);
1947                         }
1948         }
1949
1950         if (*str == ',')
1951                 slub_debug_slabs = str + 1;
1952         return 1;
1953 }
1954
1955 __setup("slub_debug", setup_slub_debug);
1956
1957 static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
1958                 const char *name, int size, gfp_t gfp_flags)
1959 {
1960         unsigned int flags = 0;
1961
1962         if (gfp_flags & SLUB_DMA)
1963                 flags = SLAB_CACHE_DMA;
1964
1965         down_write(&slub_lock);
1966         if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
1967                         flags, NULL, NULL))
1968                 goto panic;
1969
1970         list_add(&s->list, &slab_caches);
1971         up_write(&slub_lock);
1972         if (sysfs_slab_add(s))
1973                 goto panic;
1974         return s;
1975
1976 panic:
1977         panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
1978 }
1979
1980 static struct kmem_cache *get_slab(size_t size, gfp_t flags)
1981 {
1982         int index = kmalloc_index(size);
1983
1984         if (!index)
1985                 return NULL;
1986
1987         /* Allocation too large? */
1988         BUG_ON(index < 0);
1989
1990 #ifdef CONFIG_ZONE_DMA
1991         if ((flags & SLUB_DMA)) {
1992                 struct kmem_cache *s;
1993                 struct kmem_cache *x;
1994                 char *text;
1995                 size_t realsize;
1996
1997                 s = kmalloc_caches_dma[index];
1998                 if (s)
1999                         return s;
2000
2001                 /* Dynamically create dma cache */
2002                 x = kmalloc(kmem_size, flags & ~SLUB_DMA);
2003                 if (!x)
2004                         panic("Unable to allocate memory for dma cache\n");
2005
2006                 if (index <= KMALLOC_SHIFT_HIGH)
2007                         realsize = 1 << index;
2008                 else {
2009                         if (index == 1)
2010                                 realsize = 96;
2011                         else
2012                                 realsize = 192;
2013                 }
2014
2015                 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2016                                 (unsigned int)realsize);
2017                 s = create_kmalloc_cache(x, text, realsize, flags);
2018                 kmalloc_caches_dma[index] = s;
2019                 return s;
2020         }
2021 #endif
2022         return &kmalloc_caches[index];
2023 }
2024
2025 void *__kmalloc(size_t size, gfp_t flags)
2026 {
2027         struct kmem_cache *s = get_slab(size, flags);
2028
2029         if (s)
2030                 return slab_alloc(s, flags, -1, __builtin_return_address(0));
2031         return NULL;
2032 }
2033 EXPORT_SYMBOL(__kmalloc);
2034
#ifdef CONFIG_NUMA
/*
 * Same as __kmalloc() but passes @node to slab_alloc() instead of -1.
 *
 * Returns NULL if get_slab() finds no cache for the request.
 */
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
	struct kmem_cache *cache = get_slab(size, flags);

	if (!cache)
		return NULL;

	return slab_alloc(cache, flags, node, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kmalloc_node);
#endif
2046
2047 size_t ksize(const void *object)
2048 {
2049         struct page *page = get_object_page(object);
2050         struct kmem_cache *s;
2051
2052         BUG_ON(!page);
2053         s = page->slab;
2054         BUG_ON(!s);
2055
2056         /*
2057          * Debugging requires use of the padding between object
2058          * and whatever may come after it.
2059          */
2060         if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2061                 return s->objsize;
2062
2063         /*
2064          * If we have the need to store the freelist pointer
2065          * back there or track user information then we can
2066          * only use the space before that information.
2067          */
2068         if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2069                 return s->inuse;
2070
2071         /*
2072          * Else we can use all the padding etc for the allocation
2073          */
2074         return s->size;
2075 }
2076 EXPORT_SYMBOL(ksize);
2077
2078 void kfree(const void *x)
2079 {
2080         struct kmem_cache *s;
2081         struct page *page;
2082
2083         if (!x)
2084                 return;
2085
2086         page = virt_to_head_page(x);
2087         s = page->slab;
2088
2089         slab_free(s, page, (void *)x, __builtin_return_address(0));
2090 }
2091 EXPORT_SYMBOL(kfree);
2092
2093 /*
2094  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2095  * the remaining slabs by the number of items in use. The slabs with the
2096  * most items in use come first. New allocations will then fill those up
2097  * and thus they can be removed from the partial lists.
2098  *
2099  * The slabs with the least items are placed last. This results in them
2100  * being allocated from last increasing the chance that the last objects
2101  * are freed in them.
2102  */
2103 int kmem_cache_shrink(struct kmem_cache *s)
2104 {
2105         int node;
2106         int i;
2107         struct kmem_cache_node *n;
2108         struct page *page;
2109         struct page *t;
2110         struct list_head *slabs_by_inuse =
2111                 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
2112         unsigned long flags;
2113
2114         if (!slabs_by_inuse)
2115                 return -ENOMEM;
2116
2117         flush_all(s);
2118         for_each_online_node(node) {
2119                 n = get_node(s, node);
2120
2121                 if (!n->nr_partial)
2122                         continue;
2123
2124                 for (i = 0; i < s->objects; i++)
2125                         INIT_LIST_HEAD(slabs_by_inuse + i);
2126
2127                 spin_lock_irqsave(&n->list_lock, flags);
2128
2129                 /*
2130                  * Build lists indexed by the items in use in each slab.
2131                  *
2132                  * Note that concurrent frees may occur while we hold the
2133                  * list_lock. page->inuse here is the upper limit.
2134                  */
2135                 list_for_each_entry_safe(page, t, &n->partial, lru) {
2136                         if (!page->inuse && slab_trylock(page)) {
2137                                 /*
2138                                  * Must hold slab lock here because slab_free
2139                                  * may have freed the last object and be
2140                                  * waiting to release the slab.
2141                                  */
2142                                 list_del(&page->lru);
2143                                 n->nr_partial--;
2144                                 slab_unlock(page);
2145                                 discard_slab(s, page);
2146                         } else {
2147                                 if (n->nr_partial > MAX_PARTIAL)
2148                                         list_move(&page->lru,
2149                                         slabs_by_inuse + page->inuse);
2150                         }
2151                 }
2152
2153                 if (n->nr_partial <= MAX_PARTIAL)
2154                         goto out;
2155
2156                 /*
2157                  * Rebuild the partial list with the slabs filled up most
2158                  * first and the least used slabs at the end.
2159                  */
2160                 for (i = s->objects - 1; i >= 0; i--)
2161                         list_splice(slabs_by_inuse + i, n->partial.prev);
2162
2163         out:
2164                 spin_unlock_irqrestore(&n->list_lock, flags);
2165         }
2166
2167         kfree(slabs_by_inuse);
2168         return 0;
2169 }
2170 EXPORT_SYMBOL(kmem_cache_shrink);
2171
2172 /**
2173  * krealloc - reallocate memory. The contents will remain unchanged.
2174  *
2175  * @p: object to reallocate memory for.
2176  * @new_size: how many bytes of memory are required.
2177  * @flags: the type of memory to allocate.
2178  *
2179  * The contents of the object pointed to are preserved up to the
2180  * lesser of the new and old sizes.  If @p is %NULL, krealloc()
2181  * behaves exactly like kmalloc().  If @size is 0 and @p is not a
2182  * %NULL pointer, the object pointed to is freed.
2183  */
2184 void *krealloc(const void *p, size_t new_size, gfp_t flags)
2185 {
2186         void *ret;
2187         size_t ks;
2188
2189         if (unlikely(!p))
2190                 return kmalloc(new_size, flags);
2191
2192         if (unlikely(!new_size)) {
2193                 kfree(p);
2194                 return NULL;
2195         }
2196
2197         ks = ksize(p);
2198         if (ks >= new_size)
2199                 return (void *)p;
2200
2201         ret = kmalloc(new_size, flags);
2202         if (ret) {
2203                 memcpy(ret, p, min(new_size, ks));
2204                 kfree(p);
2205         }
2206         return ret;
2207 }
2208 EXPORT_SYMBOL(krealloc);
2209
2210 /********************************************************************
2211  *                      Basic setup of slabs
2212  *******************************************************************/
2213
2214 void __init kmem_cache_init(void)
2215 {
2216         int i;
2217
2218 #ifdef CONFIG_NUMA
2219         /*
2220          * Must first have the slab cache available for the allocations of the
2221          * struct kmem_cache_node's. There is special bootstrap code in
2222          * kmem_cache_open for slab_state == DOWN.
2223          */
2224         create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2225                 sizeof(struct kmem_cache_node), GFP_KERNEL);
2226 #endif
2227
2228         /* Able to allocate the per node structures */
2229         slab_state = PARTIAL;
2230
2231         /* Caches that are not of the two-to-the-power-of size */
2232         create_kmalloc_cache(&kmalloc_caches[1],
2233                                 "kmalloc-96", 96, GFP_KERNEL);
2234         create_kmalloc_cache(&kmalloc_caches[2],
2235                                 "kmalloc-192", 192, GFP_KERNEL);
2236
2237         for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
2238                 create_kmalloc_cache(&kmalloc_caches[i],
2239                         "kmalloc", 1 << i, GFP_KERNEL);
2240
2241         slab_state = UP;
2242
2243         /* Provide the correct kmalloc names now that the caches are up */
2244         for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
2245                 kmalloc_caches[i]. name =
2246                         kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2247
2248 #ifdef CONFIG_SMP
2249         register_cpu_notifier(&slab_notifier);
2250 #endif
2251
2252         if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */
2253                 kmem_size = offsetof(struct kmem_cache, cpu_slab)
2254                          + nr_cpu_ids * sizeof(struct page *);
2255
2256         printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2257                 " Processors=%d, Nodes=%d\n",
2258                 KMALLOC_SHIFT_HIGH, cache_line_size(),
2259                 slub_min_order, slub_max_order, slub_min_objects,
2260                 nr_cpu_ids, nr_node_ids);
2261 }
2262
2263 /*
2264  * Find a mergeable slab cache
2265  */
2266 static int slab_unmergeable(struct kmem_cache *s)
2267 {
2268         if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
2269                 return 1;
2270
2271         if (s->ctor || s->dtor)
2272                 return 1;
2273
2274         return 0;
2275 }
2276
2277 static struct kmem_cache *find_mergeable(size_t size,
2278                 size_t align, unsigned long flags,
2279                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
2280                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
2281 {
2282         struct list_head *h;
2283
2284         if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
2285                 return NULL;
2286
2287         if (ctor || dtor)
2288                 return NULL;
2289
2290         size = ALIGN(size, sizeof(void *));
2291         align = calculate_alignment(flags, align, size);
2292         size = ALIGN(size, align);
2293
2294         list_for_each(h, &slab_caches) {
2295                 struct kmem_cache *s =
2296                         container_of(h, struct kmem_cache, list);
2297
2298                 if (slab_unmergeable(s))
2299                         continue;
2300
2301                 if (size > s->size)
2302                         continue;
2303
2304                 if (((flags | slub_debug) & SLUB_MERGE_SAME) !=
2305                         (s->flags & SLUB_MERGE_SAME))
2306                                 continue;
2307                 /*
2308                  * Check if alignment is compatible.
2309                  * Courtesy of Adrian Drzewiecki
2310                  */
2311                 if ((s->size & ~(align -1)) != s->size)
2312                         continue;
2313
2314                 if (s->size - size >= sizeof(void *))
2315                         continue;
2316
2317                 return s;
2318         }
2319         return NULL;
2320 }
2321
2322 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2323                 size_t align, unsigned long flags,
2324                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
2325                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
2326 {
2327         struct kmem_cache *s;
2328
2329         down_write(&slub_lock);
2330         s = find_mergeable(size, align, flags, dtor, ctor);
2331         if (s) {
2332                 s->refcount++;
2333                 /*
2334                  * Adjust the object sizes so that we clear
2335                  * the complete object on kzalloc.
2336                  */
2337                 s->objsize = max(s->objsize, (int)size);
2338                 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
2339                 if (sysfs_slab_alias(s, name))
2340                         goto err;
2341         } else {
2342                 s = kmalloc(kmem_size, GFP_KERNEL);
2343                 if (s && kmem_cache_open(s, GFP_KERNEL, name,
2344                                 size, align, flags, ctor, dtor)) {
2345                         if (sysfs_slab_add(s)) {
2346                                 kfree(s);
2347                                 goto err;
2348                         }
2349                         list_add(&s->list, &slab_caches);
2350                 } else
2351                         kfree(s);
2352         }
2353         up_write(&slub_lock);
2354         return s;
2355
2356 err:
2357         up_write(&slub_lock);
2358         if (flags & SLAB_PANIC)
2359                 panic("Cannot create slabcache %s\n", name);
2360         else
2361                 s = NULL;
2362         return s;
2363 }
2364 EXPORT_SYMBOL(kmem_cache_create);
2365
2366 void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags)
2367 {
2368         void *x;
2369
2370         x = slab_alloc(s, flags, -1, __builtin_return_address(0));
2371         if (x)
2372                 memset(x, 0, s->objsize);
2373         return x;
2374 }
2375 EXPORT_SYMBOL(kmem_cache_zalloc);
2376
2377 #ifdef CONFIG_SMP
2378 static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
2379 {
2380         struct list_head *h;
2381
2382         down_read(&slub_lock);
2383         list_for_each(h, &slab_caches) {
2384                 struct kmem_cache *s =
2385                         container_of(h, struct kmem_cache, list);
2386
2387                 func(s, cpu);
2388         }
2389         up_read(&slub_lock);
2390 }
2391
2392 /*
2393  * Use the cpu notifier to insure that the cpu slabs are flushed when
2394  * necessary.
2395  */
2396 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2397                 unsigned long action, void *hcpu)
2398 {
2399         long cpu = (long)hcpu;
2400
2401         switch (action) {
2402         case CPU_UP_CANCELED:
2403         case CPU_DEAD:
2404                 for_all_slabs(__flush_cpu_slab, cpu);
2405                 break;
2406         default:
2407                 break;
2408         }
2409         return NOTIFY_OK;
2410 }
2411
2412 static struct notifier_block __cpuinitdata slab_notifier =
2413         { &slab_cpuup_callback, NULL, 0 };
2414
2415 #endif
2416
2417 #ifdef CONFIG_NUMA
2418
2419 /*****************************************************************
2420  * Generic reaper used to support the page allocator
2421  * (the cpu slabs are reaped by a per slab workqueue).
2422  *
2423  * Maybe move this to the page allocator?
2424  ****************************************************************/
2425
/* Per cpu: the node that next_reap_node() will look at on its next call */
static DEFINE_PER_CPU(unsigned long, reap_node);
2427
2428 static void init_reap_node(int cpu)
2429 {
2430         int node;
2431
2432         node = next_node(cpu_to_node(cpu), node_online_map);
2433         if (node == MAX_NUMNODES)
2434                 node = first_node(node_online_map);
2435
2436         __get_cpu_var(reap_node) = node;
2437 }
2438
2439 static void next_reap_node(void)
2440 {
2441         int node = __get_cpu_var(reap_node);
2442
2443         /*
2444          * Also drain per cpu pages on remote zones
2445          */
2446         if (node != numa_node_id())
2447                 drain_node_pages(node);
2448
2449         node = next_node(node, node_online_map);
2450         if (unlikely(node >= MAX_NUMNODES))
2451                 node = first_node(node_online_map);
2452         __get_cpu_var(reap_node) = node;
2453 }
2454 #else
/* Without CONFIG_NUMA both reap node helpers expand to empty statements */
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
2457 #endif
2458
/* Delay that cache_reap() passes to schedule_delayed_work() to rerun itself */
#define REAPTIMEOUT_CPUC        (2*HZ)
2460
2461 #ifdef CONFIG_SMP
/* Per cpu delayed work item that runs cache_reap() */
static DEFINE_PER_CPU(struct delayed_work, reap_work);
2463
2464 static void cache_reap(struct work_struct *unused)
2465 {
2466         next_reap_node();
2467         refresh_cpu_vm_stats(smp_processor_id());
2468         schedule_delayed_work(&__get_cpu_var(reap_work),
2469                                       REAPTIMEOUT_CPUC);
2470 }
2471
2472 static void __devinit start_cpu_timer(int cpu)
2473 {
2474         struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
2475
2476         /*
2477          * When this gets called from do_initcalls via cpucache_init(),
2478          * init_workqueues() has already run, so keventd will be setup
2479          * at that time.
2480          */
2481         if (keventd_up() && reap_work->work.func == NULL) {
2482                 init_reap_node(cpu);
2483                 INIT_DELAYED_WORK(reap_work, cache_reap);
2484                 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
2485         }
2486 }
2487
2488 static int __init cpucache_init(void)
2489 {
2490         int cpu;
2491
2492         /*
2493          * Register the timers that drain pcp pages and update vm statistics
2494          */
2495         for_each_online_cpu(cpu)
2496                 start_cpu_timer(cpu);
2497         return 0;
2498 }
2499 __initcall(cpucache_init);
2500 #endif
2501
2502 #ifdef SLUB_RESILIENCY_TEST
2503 static unsigned long validate_slab_cache(struct kmem_cache *s);
2504
/*
 * Self test, only built with SLUB_RESILIENCY_TEST: deliberately corrupt
 * memory in and around kmalloc objects and then run validate_slab_cache()
 * on the affected kmalloc cache.
 *
 * Part A writes outside objects that are still allocated, part B writes
 * to objects after they have been freed. None of the allocations is
 * checked for failure, and the objects of part A are never freed.
 */
static void resiliency_test(void)
{
	u8 *p;

	printk(KERN_ERR "SLUB resiliency testing\n");
	printk(KERN_ERR "-----------------------\n");
	printk(KERN_ERR "A. Corruption after allocation\n");

	/* One byte directly behind a 16 byte object */
	p = kzalloc(16, GFP_KERNEL);
	p[16] = 0x12;
	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
			" 0x12->0x%p\n\n", p + 16);

	validate_slab_cache(kmalloc_caches + 4);

	/* Hmmm... The next two are dangerous */
	/* One byte a pointer size behind the end of a 32 byte object */
	p = kzalloc(32, GFP_KERNEL);
	p[32 + sizeof(void *)] = 0x34;
	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
			" 0x34 -> -0x%p\n", p);
	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");

	validate_slab_cache(kmalloc_caches + 5);
	/* One byte at a cycle counter dependent distance behind the object */
	p = kzalloc(64, GFP_KERNEL);
	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
	*p = 0x56;
	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
									p);
	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
	validate_slab_cache(kmalloc_caches + 6);

	printk(KERN_ERR "\nB. Corruption after free\n");
	/* First byte of a freed 128 byte object */
	p = kzalloc(128, GFP_KERNEL);
	kfree(p);
	*p = 0x78;
	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
	validate_slab_cache(kmalloc_caches + 7);

	/* A byte in the middle of a freed 256 byte object */
	p = kzalloc(256, GFP_KERNEL);
	kfree(p);
	p[50] = 0x9a;
	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
	validate_slab_cache(kmalloc_caches + 8);

	/* One byte directly behind a freed 512 byte object */
	p = kzalloc(512, GFP_KERNEL);
	kfree(p);
	p[512] = 0xab;
	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
	validate_slab_cache(kmalloc_caches + 9);
}
2555 #else
/* Resiliency testing is compiled out: nothing to do */
static void resiliency_test(void) {}
2557 #endif
2558
2559 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2560 {
2561         struct kmem_cache *s = get_slab(size, gfpflags);
2562
2563         if (!s)
2564                 return NULL;
2565
2566         return slab_alloc(s, gfpflags, -1, caller);
2567 }
2568
2569 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2570                                         int node, void *caller)
2571 {
2572         struct kmem_cache *s = get_slab(size, gfpflags);
2573
2574         if (!s)
2575                 return NULL;
2576
2577         return slab_alloc(s, gfpflags, node, caller);
2578 }
2579
2580 #ifdef CONFIG_SYSFS
2581
2582 static int validate_slab(struct kmem_cache *s, struct page *page)
2583 {
2584         void *p;
2585         void *addr = page_address(page);
2586         unsigned long map[BITS_TO_LONGS(s->objects)];
2587
2588         if (!check_slab(s, page) ||
2589                         !on_freelist(s, page, NULL))
2590                 return 0;
2591
2592         /* Now we know that a valid freelist exists */
2593         bitmap_zero(map, s->objects);
2594
2595         for(p = page->freelist; p; p = get_freepointer(s, p)) {
2596                 set_bit((p - addr) / s->size, map);
2597                 if (!check_object(s, page, p, 0))
2598                         return 0;
2599         }
2600
2601         for(p = addr; p < addr + s->objects * s->size; p += s->size)
2602                 if (!test_bit((p - addr) / s->size, map))
2603                         if (!check_object(s, page, p, 1))
2604                                 return 0;
2605         return 1;
2606 }
2607
2608 static void validate_slab_slab(struct kmem_cache *s, struct page *page)
2609 {
2610         if (slab_trylock(page)) {
2611                 validate_slab(s, page);
2612                 slab_unlock(page);
2613         } else
2614                 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
2615                         s->name, page);
2616
2617         if (s->flags & DEBUG_DEFAULT_FLAGS) {
2618                 if (!PageError(page))
2619                         printk(KERN_ERR "SLUB %s: PageError not set "
2620                                 "on slab 0x%p\n", s->name, page);
2621         } else {
2622                 if (PageError(page))
2623                         printk(KERN_ERR "SLUB %s: PageError set on "
2624                                 "slab 0x%p\n", s->name, page);
2625         }
2626 }
2627
2628 static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
2629 {
2630         unsigned long count = 0;
2631         struct page *page;
2632         unsigned long flags;
2633
2634         spin_lock_irqsave(&n->list_lock, flags);
2635
2636         list_for_each_entry(page, &n->partial, lru) {
2637                 validate_slab_slab(s, page);
2638                 count++;
2639         }
2640         if (count != n->nr_partial)
2641                 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
2642                         "counter=%ld\n", s->name, count, n->nr_partial);
2643
2644         if (!(s->flags & SLAB_STORE_USER))
2645                 goto out;
2646
2647         list_for_each_entry(page, &n->full, lru) {
2648                 validate_slab_slab(s, page);
2649                 count++;
2650         }
2651         if (count != atomic_long_read(&n->nr_slabs))
2652                 printk(KERN_ERR "SLUB: %s %ld slabs counted but "
2653                         "counter=%ld\n", s->name, count,
2654                         atomic_long_read(&n->nr_slabs));
2655
2656 out:
2657         spin_unlock_irqrestore(&n->list_lock, flags);
2658         return count;
2659 }
2660
2661 static unsigned long validate_slab_cache(struct kmem_cache *s)
2662 {
2663         int node;
2664         unsigned long count = 0;
2665
2666         flush_all(s);
2667         for_each_online_node(node) {
2668                 struct kmem_cache_node *n = get_node(s, node);
2669
2670                 count += validate_slab_node(s, n);
2671         }
2672         return count;
2673 }
2674
2675 /*
2676  * Generate lists of code addresses where slabcache objects are allocated
2677  * and freed.
2678  */
2679
/* One tracked code address together with the number of objects seen for it. */
struct location {
	unsigned long count;	/* Objects attributed to this address */
	void *addr;		/* Code address taken from the object's track */
};
2684
/*
 * Growable array of locations. add_location() keeps the entries ordered by
 * address so that it can bisect the array.
 */
struct loc_track {
	unsigned long max;	/* Capacity of loc[] in entries, 0 if none */
	unsigned long count;	/* Entries of loc[] currently in use */
	struct location *loc;	/* Page allocated array of entries */
};
2690
2691 static void free_loc_track(struct loc_track *t)
2692 {
2693         if (t->max)
2694                 free_pages((unsigned long)t->loc,
2695                         get_order(sizeof(struct location) * t->max));
2696 }
2697
2698 static int alloc_loc_track(struct loc_track *t, unsigned long max)
2699 {
2700         struct location *l;
2701         int order;
2702
2703         if (!max)
2704                 max = PAGE_SIZE / sizeof(struct location);
2705
2706         order = get_order(sizeof(struct location) * max);
2707
2708         l = (void *)__get_free_pages(GFP_KERNEL, order);
2709
2710         if (!l)
2711                 return 0;
2712
2713         if (t->count) {
2714                 memcpy(l, t->loc, sizeof(struct location) * t->count);
2715                 free_loc_track(t);
2716         }
2717         t->max = max;
2718         t->loc = l;
2719         return 1;
2720 }
2721
2722 static int add_location(struct loc_track *t, struct kmem_cache *s,
2723                                                 void *addr)
2724 {
2725         long start, end, pos;
2726         struct location *l;
2727         void *caddr;
2728
2729         start = -1;
2730         end = t->count;
2731
2732         for ( ; ; ) {
2733                 pos = start + (end - start + 1) / 2;
2734
2735                 /*
2736                  * There is nothing at "end". If we end up there
2737                  * we need to add something to before end.
2738                  */
2739                 if (pos == end)
2740                         break;
2741
2742                 caddr = t->loc[pos].addr;
2743                 if (addr == caddr) {
2744                         t->loc[pos].count++;
2745                         return 1;
2746                 }
2747
2748                 if (addr < caddr)
2749                         end = pos;
2750                 else
2751                         start = pos;
2752         }
2753
2754         /*
2755          * Not found. Insert new tracking element.
2756          */
2757         if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
2758                 return 0;
2759
2760         l = t->loc + pos;
2761         if (pos < t->count)
2762                 memmove(l + 1, l,
2763                         (t->count - pos) * sizeof(struct location));
2764         t->count++;
2765         l->count = 1;
2766         l->addr = addr;
2767         return 1;
2768 }
2769
2770 static void process_slab(struct loc_track *t, struct kmem_cache *s,
2771                 struct page *page, enum track_item alloc)
2772 {
2773         void *addr = page_address(page);
2774         unsigned long map[BITS_TO_LONGS(s->objects)];
2775         void *p;
2776
2777         bitmap_zero(map, s->objects);
2778         for (p = page->freelist; p; p = get_freepointer(s, p))
2779                 set_bit((p - addr) / s->size, map);
2780
2781         for (p = addr; p < addr + s->objects * s->size; p += s->size)
2782                 if (!test_bit((p - addr) / s->size, map)) {
2783                         void *addr = get_track(s, p, alloc)->addr;
2784
2785                         add_location(t, s, addr);
2786                 }
2787 }
2788
2789 static int list_locations(struct kmem_cache *s, char *buf,
2790                                         enum track_item alloc)
2791 {
2792         int n = 0;
2793         unsigned long i;
2794         struct loc_track t;
2795         int node;
2796
2797         t.count = 0;
2798         t.max = 0;
2799
2800         /* Push back cpu slabs */
2801         flush_all(s);
2802
2803         for_each_online_node(node) {
2804                 struct kmem_cache_node *n = get_node(s, node);
2805                 unsigned long flags;
2806                 struct page *page;
2807
2808                 if (!atomic_read(&n->nr_slabs))
2809                         continue;
2810
2811                 spin_lock_irqsave(&n->list_lock, flags);
2812                 list_for_each_entry(page, &n->partial, lru)
2813                         process_slab(&t, s, page, alloc);
2814                 list_for_each_entry(page, &n->full, lru)
2815                         process_slab(&t, s, page, alloc);
2816                 spin_unlock_irqrestore(&n->list_lock, flags);
2817         }
2818
2819         for (i = 0; i < t.count; i++) {
2820                 void *addr = t.loc[i].addr;
2821
2822                 if (n > PAGE_SIZE - 100)
2823                         break;
2824                 n += sprintf(buf + n, "%7ld ", t.loc[i].count);
2825                 if (addr)
2826                         n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr);
2827                 else
2828                         n += sprintf(buf + n, "<not-available>");
2829                 n += sprintf(buf + n, "\n");
2830         }
2831
2832         free_loc_track(&t);
2833         if (!t.count)
2834                 n += sprintf(buf, "No data\n");
2835         return n;
2836 }
2837
2838 static unsigned long count_partial(struct kmem_cache_node *n)
2839 {
2840         unsigned long flags;
2841         unsigned long x = 0;
2842         struct page *page;
2843
2844         spin_lock_irqsave(&n->list_lock, flags);
2845         list_for_each_entry(page, &n->partial, lru)
2846                 x += page->inuse;
2847         spin_unlock_irqrestore(&n->list_lock, flags);
2848         return x;
2849 }
2850
/*
 * Bit numbers selecting what slab_objects() reports. They are combined
 * through the SO_* masks below.
 */
enum slab_stat_type {
	SL_FULL,	/* Slabs that are neither partial nor per cpu slabs */
	SL_PARTIAL,	/* Slabs on the per node partial lists */
	SL_CPU,		/* Slabs currently assigned to a cpu */
	SL_OBJECTS	/* Report objects instead of slabs */
};

/* Bit masks passed in the flags argument of slab_objects() */
#define SO_FULL		(1 << SL_FULL)
#define SO_PARTIAL	(1 << SL_PARTIAL)
#define SO_CPU		(1 << SL_CPU)
#define SO_OBJECTS	(1 << SL_OBJECTS)
2862
2863 static unsigned long slab_objects(struct kmem_cache *s,
2864                         char *buf, unsigned long flags)
2865 {
2866         unsigned long total = 0;
2867         int cpu;
2868         int node;
2869         int x;
2870         unsigned long *nodes;
2871         unsigned long *per_cpu;
2872
2873         nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
2874         per_cpu = nodes + nr_node_ids;
2875
2876         for_each_possible_cpu(cpu) {
2877                 struct page *page = s->cpu_slab[cpu];
2878                 int node;
2879
2880                 if (page) {
2881                         node = page_to_nid(page);
2882                         if (flags & SO_CPU) {
2883                                 int x = 0;
2884
2885                                 if (flags & SO_OBJECTS)
2886                                         x = page->inuse;
2887                                 else
2888                                         x = 1;
2889                                 total += x;
2890                                 nodes[node] += x;
2891                         }
2892                         per_cpu[node]++;
2893                 }
2894         }
2895
2896         for_each_online_node(node) {
2897                 struct kmem_cache_node *n = get_node(s, node);
2898
2899                 if (flags & SO_PARTIAL) {
2900                         if (flags & SO_OBJECTS)
2901                                 x = count_partial(n);
2902                         else
2903                                 x = n->nr_partial;
2904                         total += x;
2905                         nodes[node] += x;
2906                 }
2907
2908                 if (flags & SO_FULL) {
2909                         int full_slabs = atomic_read(&n->nr_slabs)
2910                                         - per_cpu[node]
2911                                         - n->nr_partial;
2912
2913                         if (flags & SO_OBJECTS)
2914                                 x = full_slabs * s->objects;
2915                         else
2916                                 x = full_slabs;
2917                         total += x;
2918                         nodes[node] += x;
2919                 }
2920         }
2921
2922         x = sprintf(buf, "%lu", total);
2923 #ifdef CONFIG_NUMA
2924         for_each_online_node(node)
2925                 if (nodes[node])
2926                         x += sprintf(buf + x, " N%d=%lu",
2927                                         node, nodes[node]);
2928 #endif
2929         kfree(nodes);
2930         return x + sprintf(buf + x, "\n");
2931 }
2932
2933 static int any_slab_objects(struct kmem_cache *s)
2934 {
2935         int node;
2936         int cpu;
2937
2938         for_each_possible_cpu(cpu)
2939                 if (s->cpu_slab[cpu])
2940                         return 1;
2941
2942         for_each_node(node) {
2943                 struct kmem_cache_node *n = get_node(s, node);
2944
2945                 if (n->nr_partial || atomic_read(&n->nr_slabs))
2946                         return 1;
2947         }
2948         return 0;
2949 }
2950
/*
 * Map an embedded attribute / kobject back to its container. The macros
 * expand to plain expressions; the statement terminator belongs to the
 * user of the macro.
 */
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)
2953
/*
 * A sysfs attribute of a slab cache. slab_attr_show() and slab_attr_store()
 * dispatch to these methods and fail with -EIO if the respective method is
 * not set.
 */
struct slab_attribute {
	struct attribute attr;
	/* Format the value into buf, returns bytes written or an error */
	ssize_t (*show)(struct kmem_cache *s, char *buf);
	/* Consume count bytes from x, returns bytes consumed or an error */
	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};
2959
/*
 * Define <name>_attr as a read-only attribute. This relies on a function
 * <name>_show being defined.
 */
#define SLAB_ATTR_RO(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RO(_name)

/*
 * Define <name>_attr with mode 0644, wired to <name>_show and <name>_store.
 */
#define SLAB_ATTR(_name) \
	static struct slab_attribute _name##_attr =  \
	__ATTR(_name, 0644, _name##_show, _name##_store)
2966
2967 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
2968 {
2969         return sprintf(buf, "%d\n", s->size);
2970 }
2971 SLAB_ATTR_RO(slab_size);
2972
2973 static ssize_t align_show(struct kmem_cache *s, char *buf)
2974 {
2975         return sprintf(buf, "%d\n", s->align);
2976 }
2977 SLAB_ATTR_RO(align);
2978
2979 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
2980 {
2981         return sprintf(buf, "%d\n", s->objsize);
2982 }
2983 SLAB_ATTR_RO(object_size);
2984
2985 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
2986 {
2987         return sprintf(buf, "%d\n", s->objects);
2988 }
2989 SLAB_ATTR_RO(objs_per_slab);
2990
2991 static ssize_t order_show(struct kmem_cache *s, char *buf)
2992 {
2993         return sprintf(buf, "%d\n", s->order);
2994 }
2995 SLAB_ATTR_RO(order);
2996
2997 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
2998 {
2999         if (s->ctor) {
3000                 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3001
3002                 return n + sprintf(buf + n, "\n");
3003         }
3004         return 0;
3005 }
3006 SLAB_ATTR_RO(ctor);
3007
3008 static ssize_t dtor_show(struct kmem_cache *s, char *buf)
3009 {
3010         if (s->dtor) {
3011                 int n = sprint_symbol(buf, (unsigned long)s->dtor);
3012
3013                 return n + sprintf(buf + n, "\n");
3014         }
3015         return 0;
3016 }
3017 SLAB_ATTR_RO(dtor);
3018
3019 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3020 {
3021         return sprintf(buf, "%d\n", s->refcount - 1);
3022 }
3023 SLAB_ATTR_RO(aliases);
3024
3025 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3026 {
3027         return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
3028 }
3029 SLAB_ATTR_RO(slabs);
3030
3031 static ssize_t partial_show(struct kmem_cache *s, char *buf)
3032 {
3033         return slab_objects(s, buf, SO_PARTIAL);
3034 }
3035 SLAB_ATTR_RO(partial);
3036
3037 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
3038 {
3039         return slab_objects(s, buf, SO_CPU);
3040 }
3041 SLAB_ATTR_RO(cpu_slabs);
3042
3043 static ssize_t objects_show(struct kmem_cache *s, char *buf)
3044 {
3045         return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
3046 }
3047 SLAB_ATTR_RO(objects);
3048
3049 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3050 {
3051         return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
3052 }
3053
3054 static ssize_t sanity_checks_store(struct kmem_cache *s,
3055                                 const char *buf, size_t length)
3056 {
3057         s->flags &= ~SLAB_DEBUG_FREE;
3058         if (buf[0] == '1')
3059                 s->flags |= SLAB_DEBUG_FREE;
3060         return length;
3061 }
3062 SLAB_ATTR(sanity_checks);
3063
3064 static ssize_t trace_show(struct kmem_cache *s, char *buf)
3065 {
3066         return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
3067 }
3068
3069 static ssize_t trace_store(struct kmem_cache *s, const char *buf,
3070                                                         size_t length)
3071 {
3072         s->flags &= ~SLAB_TRACE;
3073         if (buf[0] == '1')
3074                 s->flags |= SLAB_TRACE;
3075         return length;
3076 }
3077 SLAB_ATTR(trace);
3078
3079 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3080 {
3081         return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3082 }
3083
3084 static ssize_t reclaim_account_store(struct kmem_cache *s,
3085                                 const char *buf, size_t length)
3086 {
3087         s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3088         if (buf[0] == '1')
3089                 s->flags |= SLAB_RECLAIM_ACCOUNT;
3090         return length;
3091 }
3092 SLAB_ATTR(reclaim_account);
3093
3094 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3095 {
3096         return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3097 }
3098 SLAB_ATTR_RO(hwcache_align);
3099
#ifdef CONFIG_ZONE_DMA
/* sysfs "cache_dma": 1 if SLAB_CACHE_DMA is set for the cache. */
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
	int enabled = (s->flags & SLAB_CACHE_DMA) ? 1 : 0;

	return sprintf(buf, "%d\n", enabled);
}
SLAB_ATTR_RO(cache_dma);
#endif
3107
3108 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3109 {
3110         return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3111 }
3112 SLAB_ATTR_RO(destroy_by_rcu);
3113
3114 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3115 {
3116         return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3117 }
3118
3119 static ssize_t red_zone_store(struct kmem_cache *s,
3120                                 const char *buf, size_t length)
3121 {
3122         if (any_slab_objects(s))
3123                 return -EBUSY;
3124
3125         s->flags &= ~SLAB_RED_ZONE;
3126         if (buf[0] == '1')
3127                 s->flags |= SLAB_RED_ZONE;
3128         calculate_sizes(s);
3129         return length;
3130 }
3131 SLAB_ATTR(red_zone);
3132
3133 static ssize_t poison_show(struct kmem_cache *s, char *buf)
3134 {
3135         return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3136 }
3137
3138 static ssize_t poison_store(struct kmem_cache *s,
3139                                 const char *buf, size_t length)
3140 {
3141         if (any_slab_objects(s))
3142                 return -EBUSY;
3143
3144         s->flags &= ~SLAB_POISON;
3145         if (buf[0] == '1')
3146                 s->flags |= SLAB_POISON;
3147         calculate_sizes(s);
3148         return length;
3149 }
3150 SLAB_ATTR(poison);
3151
3152 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3153 {
3154         return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3155 }
3156
3157 static ssize_t store_user_store(struct kmem_cache *s,
3158                                 const char *buf, size_t length)
3159 {
3160         if (any_slab_objects(s))
3161                 return -EBUSY;
3162
3163         s->flags &= ~SLAB_STORE_USER;
3164         if (buf[0] == '1')
3165                 s->flags |= SLAB_STORE_USER;
3166         calculate_sizes(s);
3167         return length;
3168 }
3169 SLAB_ATTR(store_user);
3170
3171 static ssize_t validate_show(struct kmem_cache *s, char *buf)
3172 {
3173         return 0;
3174 }
3175
3176 static ssize_t validate_store(struct kmem_cache *s,
3177                         const char *buf, size_t length)
3178 {
3179         if (buf[0] == '1')
3180                 validate_slab_cache(s);
3181         else
3182                 return -EINVAL;
3183         return length;
3184 }
3185 SLAB_ATTR(validate);
3186
3187 static ssize_t shrink_show(struct kmem_cache *s, char *buf)
3188 {
3189         return 0;
3190 }
3191
3192 static ssize_t shrink_store(struct kmem_cache *s,
3193                         const char *buf, size_t length)
3194 {
3195         if (buf[0] == '1') {
3196                 int rc = kmem_cache_shrink(s);
3197
3198                 if (rc)
3199                         return rc;
3200         } else
3201                 return -EINVAL;
3202         return length;
3203 }
3204 SLAB_ATTR(shrink);
3205
3206 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
3207 {
3208         if (!(s->flags & SLAB_STORE_USER))
3209                 return -ENOSYS;
3210         return list_locations(s, buf, TRACK_ALLOC);
3211 }
3212 SLAB_ATTR_RO(alloc_calls);
3213
3214 static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
3215 {
3216         if (!(s->flags & SLAB_STORE_USER))
3217                 return -ENOSYS;
3218         return list_locations(s, buf, TRACK_FREE);
3219 }
3220 SLAB_ATTR_RO(free_calls);
3221
#ifdef CONFIG_NUMA
/* sysfs "defrag_ratio": prints s->defrag_ratio divided by ten. */
static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->defrag_ratio / 10);
}

/*
 * Set the defrag ratio from a decimal number below 100. The value is
 * stored multiplied by ten. Numbers that are out of range are ignored and
 * the input is still reported as consumed.
 *
 * The parsed number is kept unsigned: stored in an int, a very large input
 * would wrap to a negative value, slip through the range check and end up
 * as a negative ratio.
 */
static ssize_t defrag_ratio_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	unsigned long ratio = simple_strtoul(buf, NULL, 10);

	if (ratio < 100)
		s->defrag_ratio = ratio * 10;
	return length;
}
SLAB_ATTR(defrag_ratio);
#endif
3239
/*
 * NULL terminated list of the attributes that make up slab_attr_group.
 * cache_dma and defrag_ratio are only present if the corresponding config
 * option is enabled.
 */
static struct attribute * slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&objects_attr.attr,
	&slabs_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&dtor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
	&shrink_attr.attr,
	&alloc_calls_attr.attr,
	&free_calls_attr.attr,
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&defrag_ratio_attr.attr,
#endif
	NULL
};

/* Attribute group created for every cache by sysfs_slab_add() */
static struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};
3277
3278 static ssize_t slab_attr_show(struct kobject *kobj,
3279                                 struct attribute *attr,
3280                                 char *buf)
3281 {
3282         struct slab_attribute *attribute;
3283         struct kmem_cache *s;
3284         int err;
3285
3286         attribute = to_slab_attr(attr);
3287         s = to_slab(kobj);
3288
3289         if (!attribute->show)
3290                 return -EIO;
3291
3292         err = attribute->show(s, buf);
3293
3294         return err;
3295 }
3296
3297 static ssize_t slab_attr_store(struct kobject *kobj,
3298                                 struct attribute *attr,
3299                                 const char *buf, size_t len)
3300 {
3301         struct slab_attribute *attribute;
3302         struct kmem_cache *s;
3303         int err;
3304
3305         attribute = to_slab_attr(attr);
3306         s = to_slab(kobj);
3307
3308         if (!attribute->store)
3309                 return -EIO;
3310
3311         err = attribute->store(s, buf, len);
3312
3313         return err;
3314 }
3315
/* Route sysfs reads and writes to the slab attribute dispatchers */
static struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};

/* kobject type of slab caches; uevent_filter() matches against it */
static struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
};
3324
3325 static int uevent_filter(struct kset *kset, struct kobject *kobj)
3326 {
3327         struct kobj_type *ktype = get_ktype(kobj);
3328
3329         if (ktype == &slab_ktype)
3330                 return 1;
3331         return 0;
3332 }
3333
/* uevent operations of the slab subsystem */
static struct kset_uevent_ops slab_uevent_ops = {
	.filter = uevent_filter,
};

/*
 * Declare the slab subsystem. The code below refers to it as slab_subsys.
 */
decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
3339
3340 #define ID_STR_LENGTH 64
3341
3342 /* Create a unique string id for a slab cache:
3343  * format
3344  * :[flags-]size:[memory address of kmemcache]
3345  */
3346 static char *create_unique_id(struct kmem_cache *s)
3347 {
3348         char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
3349         char *p = name;
3350
3351         BUG_ON(!name);
3352
3353         *p++ = ':';
3354         /*
3355          * First flags affecting slabcache operations. We will only
3356          * get here for aliasable slabs so we do not need to support
3357          * too many flags. The flags here must cover all flags that
3358          * are matched during merging to guarantee that the id is
3359          * unique.
3360          */
3361         if (s->flags & SLAB_CACHE_DMA)
3362                 *p++ = 'd';
3363         if (s->flags & SLAB_RECLAIM_ACCOUNT)
3364                 *p++ = 'a';
3365         if (s->flags & SLAB_DEBUG_FREE)
3366                 *p++ = 'F';
3367         if (p != name + 1)
3368                 *p++ = '-';
3369         p += sprintf(p, "%07d", s->size);
3370         BUG_ON(p > name + ID_STR_LENGTH - 1);
3371         return name;
3372 }
3373
3374 static int sysfs_slab_add(struct kmem_cache *s)
3375 {
3376         int err;
3377         const char *name;
3378         int unmergeable;
3379
3380         if (slab_state < SYSFS)
3381                 /* Defer until later */
3382                 return 0;
3383
3384         unmergeable = slab_unmergeable(s);
3385         if (unmergeable) {
3386                 /*
3387                  * Slabcache can never be merged so we can use the name proper.
3388                  * This is typically the case for debug situations. In that
3389                  * case we can catch duplicate names easily.
3390                  */
3391                 sysfs_remove_link(&slab_subsys.kobj, s->name);
3392                 name = s->name;
3393         } else {
3394                 /*
3395                  * Create a unique name for the slab as a target
3396                  * for the symlinks.
3397                  */
3398                 name = create_unique_id(s);
3399         }
3400
3401         kobj_set_kset_s(s, slab_subsys);
3402         kobject_set_name(&s->kobj, name);
3403         kobject_init(&s->kobj);
3404         err = kobject_add(&s->kobj);
3405         if (err)
3406                 return err;
3407
3408         err = sysfs_create_group(&s->kobj, &slab_attr_group);
3409         if (err)
3410                 return err;
3411         kobject_uevent(&s->kobj, KOBJ_ADD);
3412         if (!unmergeable) {
3413                 /* Setup first alias */
3414                 sysfs_slab_alias(s, s->name);
3415                 kfree(name);
3416         }
3417         return 0;
3418 }
3419
/*
 * Remove a cache from sysfs: send the KOBJ_REMOVE uevent and then
 * unregister the kobject of the cache.
 */
static void sysfs_slab_remove(struct kmem_cache *s)
{
	kobject_uevent(&s->kobj, KOBJ_REMOVE);
	kobject_del(&s->kobj);
}
3425
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
struct saved_alias {
	struct kmem_cache *s;		/* Cache the alias refers to */
	const char *name;		/* Name of the alias */
	struct saved_alias *next;	/* Next buffered alias or NULL */
};

/*
 * Head of the list of buffered aliases. It is filled by sysfs_slab_alias()
 * and drained by slab_sysfs_init(), so it is private to this file.
 */
static struct saved_alias *alias_list;
3437
3438 static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
3439 {
3440         struct saved_alias *al;
3441
3442         if (slab_state == SYSFS) {
3443                 /*
3444                  * If we have a leftover link then remove it.
3445                  */
3446                 sysfs_remove_link(&slab_subsys.kobj, name);
3447                 return sysfs_create_link(&slab_subsys.kobj,
3448                                                 &s->kobj, name);
3449         }
3450
3451         al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
3452         if (!al)
3453                 return -ENOMEM;
3454
3455         al->s = s;
3456         al->name = name;
3457         al->next = alias_list;
3458         alias_list = al;
3459         return 0;
3460 }
3461
3462 static int __init slab_sysfs_init(void)
3463 {
3464         struct list_head *h;
3465         int err;
3466
3467         err = subsystem_register(&slab_subsys);
3468         if (err) {
3469                 printk(KERN_ERR "Cannot register slab subsystem.\n");
3470                 return -ENOSYS;
3471         }
3472
3473         slab_state = SYSFS;
3474
3475         list_for_each(h, &slab_caches) {
3476                 struct kmem_cache *s =
3477                         container_of(h, struct kmem_cache, list);
3478
3479                 err = sysfs_slab_add(s);
3480                 BUG_ON(err);
3481         }
3482
3483         while (alias_list) {
3484                 struct saved_alias *al = alias_list;
3485
3486                 alias_list = alias_list->next;
3487                 err = sysfs_slab_alias(al->s, al->name);
3488                 BUG_ON(err);
3489                 kfree(al);
3490         }
3491
3492         resiliency_test();
3493         return 0;
3494 }
3495
3496 __initcall(slab_sysfs_init);
3497 #endif