mm/slab.c

   1 /*
   2  * linux/mm/slab.c
   3  * Written by Mark Hemment, 1996/97.
   4  * (markhe@nextd.demon.co.uk)
   5  *
   6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
   7  *
   8  * Major cleanup, different bufctl logic, per-cpu arrays
   9  *      (c) 2000 Manfred Spraul
  10  *
  11  * Cleanup, make the head arrays unconditional, preparation for NUMA
  12  *      (c) 2002 Manfred Spraul
  13  *
  14  * An implementation of the Slab Allocator as described in outline in;
  15  *      UNIX Internals: The New Frontiers by Uresh Vahalia
  16  *      Pub: Prentice Hall      ISBN 0-13-101908-2
  17  * or with a little more detail in;
  18  *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  19  *      Jeff Bonwick (Sun Microsystems).
  20  *      Presented at: USENIX Summer 1994 Technical Conference
  21  *
  22  * The memory is organized in caches, one cache for each object type.
  23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
  24  * Each cache consists out of many slabs (they are small (usually one
  25  * page long) and always contiguous), and each slab contains multiple
  26  * initialized objects.
  27  *
  28  * This means, that your constructor is used only for newly allocated
  29  * slabs and you must pass objects with the same initializations to
  30  * kmem_cache_free.
  31  *
  32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
  33  * normal). If you need a special memory type, then you must create a new
  34  * cache for that memory type.
  35  *
  36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
  37  *   full slabs with 0 free objects
  38  *   partial slabs
  39  *   empty slabs with no allocated objects
  40  *
  41  * If partial slabs exist, then new allocations come from these slabs,
  42  * otherwise from empty slabs or new slabs are allocated.
  43  *
  44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
  45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
  46  *
  47  * Each cache has a short per-cpu head array, most allocs
  48  * and frees go into that array, and if that array overflows, then 1/2
  49  * of the entries in the array are given back into the global cache.
  50  * The head array is strictly LIFO and should improve the cache hit rates.
  51  * On SMP, it additionally reduces the spinlock operations.
  52  *
  53  * The c_cpuarray may not be read with enabled local interrupts -
  54  * it's changed with a smp_call_function().
  55  *
  56  * SMP synchronization:
  57  *  constructors and destructors are called without any locking.
  58  *  Several members in kmem_cache_t and struct slab never change, they
  59  *      are accessed without any locking.
  60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  61  *      and local interrupts are disabled so slab code is preempt-safe.
  62  *  The non-constant members are protected with a per-cache irq spinlock.
  63  *
  64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
  65  * in 2000 - many ideas in the current implementation are derived from
  66  * his patch.
  67  *
  68  * Further notes from the original documentation:
  69  *
  70  * 11 April '97.  Started multi-threading - markhe
  71  *      The global cache-chain is protected by the semaphore 'cache_chain_sem'.
  72  *      The sem is only needed when accessing/extending the cache-chain, which
  73  *      can never happen inside an interrupt (kmem_cache_create(),
  74  *      kmem_cache_shrink() and kmem_cache_reap()).
  75  *
  76  *      At present, each engine can be growing a cache.  This should be blocked.
  77  *
  78  */
  79
  80 #include        <linux/config.h>
  81 #include        <linux/slab.h>
  82 #include        <linux/mm.h>
  83 #include        <linux/swap.h>
  84 #include        <linux/cache.h>
  85 #include        <linux/interrupt.h>
  86 #include        <linux/init.h>
  87 #include        <linux/compiler.h>
  88 #include        <linux/seq_file.h>
  89 #include        <linux/notifier.h>
  90 #include        <linux/kallsyms.h>
  91 #include        <linux/cpu.h>
  92 #include        <linux/sysctl.h>
  93 #include        <linux/module.h>
  94 #include        <linux/rcupdate.h>
  95 #include        <linux/string.h>
  96
  97 #include        <asm/uaccess.h>
  98 #include        <asm/cacheflush.h>
  99 #include        <asm/tlbflush.h>
 100 #include        <asm/page.h>
 101
 102 /*
 103  * DEBUG        - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
 104  *                SLAB_RED_ZONE & SLAB_POISON.
 105  *                0 for faster, smaller code (especially in the critical paths).
 106  *
 107  * STATS        - 1 to collect stats for /proc/slabinfo.
 108  *                0 for faster, smaller code (especially in the critical paths).
 109  *
 110  * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 111  */
 112
 113 #ifdef CONFIG_DEBUG_SLAB
 114 #define DEBUG           1
 115 #define STATS           1
 116 #define FORCED_DEBUG    1
 117 #else
 118 #define DEBUG           0
 119 #define STATS           0
 120 #define FORCED_DEBUG    0
 121 #endif
 122
 123
 124 /* Shouldn't this be in a header file somewhere? */
 125 #define BYTES_PER_WORD          sizeof(void *)
 126
 127 #ifndef cache_line_size
 128 #define cache_line_size()       L1_CACHE_BYTES
 129 #endif
 130
 131 #ifndef ARCH_KMALLOC_MINALIGN
 132 /*
 133  * Enforce a minimum alignment for the kmalloc caches.
 134  * Usually, the kmalloc caches are cache_line_size() aligned, except when
 135  * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 136  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 137  * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
 138  * Note that this flag disables some debug features.
 139  */
 140 #define ARCH_KMALLOC_MINALIGN 0
 141 #endif
 142
 143 #ifndef ARCH_SLAB_MINALIGN
 144 /*
 145  * Enforce a minimum alignment for all caches.
 146  * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 147  * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 148  * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 149  * some debug features.
 150  */
 151 #define ARCH_SLAB_MINALIGN 0
 152 #endif
 153
 154 #ifndef ARCH_KMALLOC_FLAGS
 155 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 156 #endif
 157
 158 /* Legal flag mask for kmem_cache_create(). */
 159 #if DEBUG
 160 # define CREATE_MASK    (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
 161                          SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 162                          SLAB_NO_REAP | SLAB_CACHE_DMA | \
 163                          SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
 164                          SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 165                          SLAB_DESTROY_BY_RCU)
 166 #else
 167 # define CREATE_MASK    (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
 168                          SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
 169                          SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 170                          SLAB_DESTROY_BY_RCU)
 171 #endif
 172
 173 /*
 174  * kmem_bufctl_t:
 175  *
  176  * Bufctls are used for linking objs within a slab:
  177  * they are linked offsets.
 178  *
 179  * This implementation relies on "struct page" for locating the cache &
 180  * slab an object belongs to.
 181  * This allows the bufctl structure to be small (one int), but limits
 182  * the number of objects a slab (not a cache) can contain when off-slab
 183  * bufctls are used. The limit is the size of the largest general cache
 184  * that does not use off-slab slabs.
  185  * For 32bit archs with 4 kB pages, this is 56.
 186  * This is not serious, as it is only for large objects, when it is unwise
 187  * to have too many per slab.
 188  * Note: This limit can be raised by introducing a general cache whose size
 189  * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 190  */
 191
 192 #define BUFCTL_END      (((kmem_bufctl_t)(~0U))-0)
 193 #define BUFCTL_FREE     (((kmem_bufctl_t)(~0U))-1)
 194 #define SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-2)
 195
 196 /* Max number of objs-per-slab for caches which use off-slab slabs.
 197  * Needed to avoid a possible looping condition in cache_grow().
 198  */
 199 static unsigned long offslab_limit;
 200
 201 /*
 202  * struct slab
 203  *
 204  * Manages the objs in a slab. Placed either at the beginning of mem allocated
 205  * for a slab, or allocated from an general cache.
 206  * Slabs are chained into three list: fully used, partial, fully free slabs.
 207  */
 208 struct slab {
 209         struct list_head        list;
 210         unsigned long           colouroff;
 211         void                    *s_mem;         /* including colour offset */
 212         unsigned int            inuse;          /* num of objs active in slab */
 213         kmem_bufctl_t           free;
 214 };
 215
 216 /*
 217  * struct slab_rcu
 218  *
 219  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 220  * arrange for kmem_freepages to be called via RCU.  This is useful if
 221  * we need to approach a kernel structure obliquely, from its address
 222  * obtained without the usual locking.  We can lock the structure to
 223  * stabilize it and check it's still at the given address, only if we
 224  * can be sure that the memory has not been meanwhile reused for some
 225  * other kind of object (which our subsystem's lock might corrupt).
 226  *
 227  * rcu_read_lock before reading the address, then rcu_read_unlock after
 228  * taking the spinlock within the structure expected at that address.
 229  *
 230  * We assume struct slab_rcu can overlay struct slab when destroying.
 231  */
 232 struct slab_rcu {
 233         struct rcu_head         head;
 234         kmem_cache_t            *cachep;
 235         void                    *addr;
 236 };
 237
 238 /*
 239  * struct array_cache
 240  *
 241  * Per cpu structures
 242  * Purpose:
 243  * - LIFO ordering, to hand out cache-warm objects from _alloc
 244  * - reduce the number of linked list operations
 245  * - reduce spinlock operations
 246  *
 247  * The limit is stored in the per-cpu structure to reduce the data cache
 248  * footprint.
 249  *
 250  */
 251 struct array_cache {
 252         unsigned int avail;
 253         unsigned int limit;
 254         unsigned int batchcount;
 255         unsigned int touched;
 256 };
 257
 258 /* bootstrap: The caches do not work without cpuarrays anymore,
 259  * but the cpuarrays are allocated from the generic caches...
 260  */
 261 #define BOOT_CPUCACHE_ENTRIES   1
 262 struct arraycache_init {
 263         struct array_cache cache;
 264         void * entries[BOOT_CPUCACHE_ENTRIES];
 265 };
 266
 267 /*
 268  * The slab lists of all objects.
 269  * Hopefully reduce the internal fragmentation
 270  * NUMA: The spinlock could be moved from the kmem_cache_t
 271  * into this structure, too. Figure out what causes
 272  * fewer cross-node spinlock operations.
 273  */
 274 struct kmem_list3 {
 275         struct list_head        slabs_partial;  /* partial list first, better asm code */
 276         struct list_head        slabs_full;
 277         struct list_head        slabs_free;
 278         unsigned long   free_objects;
 279         int             free_touched;
 280         unsigned long   next_reap;
 281         struct array_cache      *shared;
 282 };
 283
 284 #define LIST3_INIT(parent) \
 285         { \
 286                 .slabs_full     = LIST_HEAD_INIT(parent.slabs_full), \
 287                 .slabs_partial  = LIST_HEAD_INIT(parent.slabs_partial), \
 288                 .slabs_free     = LIST_HEAD_INIT(parent.slabs_free) \
 289         }
 290 #define list3_data(cachep) \
 291         (&(cachep)->lists)
 292
 293 /* NUMA: per-node */
 294 #define list3_data_ptr(cachep, ptr) \
 295                 list3_data(cachep)
 296
 297 /*
 298  * kmem_cache_t
 299  *
 300  * manages a cache.
 301  */
 302
 303 struct kmem_cache_s {
 304 /* 1) per-cpu data, touched during every alloc/free */
 305         struct array_cache      *array[NR_CPUS];
 306         unsigned int            batchcount;
 307         unsigned int            limit;
 308 /* 2) touched by every alloc & free from the backend */
 309         struct kmem_list3       lists;
 310         /* NUMA: kmem_3list_t   *nodelists[MAX_NUMNODES] */
 311         unsigned int            objsize;
 312         unsigned int            flags;  /* constant flags */
 313         unsigned int            num;    /* # of objs per slab */
 314         unsigned int            free_limit; /* upper limit of objects in the lists */
 315         spinlock_t              spinlock;
 316
 317 /* 3) cache_grow/shrink */
 318         /* order of pgs per slab (2^n) */
 319         unsigned int            gfporder;
 320
 321         /* force GFP flags, e.g. GFP_DMA */
 322         unsigned int            gfpflags;
 323
 324         size_t                  colour;         /* cache colouring range */
 325         unsigned int            colour_off;     /* colour offset */
 326         unsigned int            colour_next;    /* cache colouring */
 327         kmem_cache_t            *slabp_cache;
 328         unsigned int            slab_size;
 329         unsigned int            dflags;         /* dynamic flags */
 330
 331         /* constructor func */
 332         void (*ctor)(void *, kmem_cache_t *, unsigned long);
 333
 334         /* de-constructor func */
 335         void (*dtor)(void *, kmem_cache_t *, unsigned long);
 336
 337 /* 4) cache creation/removal */
 338         const char              *name;
 339         struct list_head        next;
 340
 341 /* 5) statistics */
 342 #if STATS
 343         unsigned long           num_active;
 344         unsigned long           num_allocations;
 345         unsigned long           high_mark;
 346         unsigned long           grown;
 347         unsigned long           reaped;
 348         unsigned long           errors;
 349         unsigned long           max_freeable;
 350         unsigned long           node_allocs;
 351         atomic_t                allochit;
 352         atomic_t                allocmiss;
 353         atomic_t                freehit;
 354         atomic_t                freemiss;
 355 #endif
 356 #if DEBUG
 357         int                     dbghead;
 358         int                     reallen;
 359 #endif
 360 };
 361
 362 #define CFLGS_OFF_SLAB          (0x80000000UL)
 363 #define OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
 364
 365 #define BATCHREFILL_LIMIT       16
 366 /* Optimization question: fewer reaps means less
 367  * probability for unnessary cpucache drain/refill cycles.
 368  *
 369  * OTHO the cpuarrays can contain lots of objects,
 370  * which could lock up otherwise freeable slabs.
 371  */
 372 #define REAPTIMEOUT_CPUC        (2*HZ)
 373 #define REAPTIMEOUT_LIST3       (4*HZ)
 374
 375 #if STATS
 376 #define STATS_INC_ACTIVE(x)     ((x)->num_active++)
 377 #define STATS_DEC_ACTIVE(x)     ((x)->num_active--)
 378 #define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 379 #define STATS_INC_GROWN(x)      ((x)->grown++)
 380 #define STATS_INC_REAPED(x)     ((x)->reaped++)
 381 #define STATS_SET_HIGH(x)       do { if ((x)->num_active > (x)->high_mark) \
 382                                         (x)->high_mark = (x)->num_active; \
 383                                 } while (0)
 384 #define STATS_INC_ERR(x)        ((x)->errors++)
 385 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
 386 #define STATS_SET_FREEABLE(x, i) \
 387                                 do { if ((x)->max_freeable < i) \
 388                                         (x)->max_freeable = i; \
 389                                 } while (0)
 390
 391 #define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 392 #define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 393 #define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
 394 #define STATS_INC_FREEMISS(x)   atomic_inc(&(x)->freemiss)
 395 #else
 396 #define STATS_INC_ACTIVE(x)     do { } while (0)
 397 #define STATS_DEC_ACTIVE(x)     do { } while (0)
 398 #define STATS_INC_ALLOCED(x)    do { } while (0)
 399 #define STATS_INC_GROWN(x)      do { } while (0)
 400 #define STATS_INC_REAPED(x)     do { } while (0)
 401 #define STATS_SET_HIGH(x)       do { } while (0)
 402 #define STATS_INC_ERR(x)        do { } while (0)
 403 #define STATS_INC_NODEALLOCS(x) do { } while (0)
 404 #define STATS_SET_FREEABLE(x, i) \
 405                                 do { } while (0)
 406
 407 #define STATS_INC_ALLOCHIT(x)   do { } while (0)
 408 #define STATS_INC_ALLOCMISS(x)  do { } while (0)
 409 #define STATS_INC_FREEHIT(x)    do { } while (0)
 410 #define STATS_INC_FREEMISS(x)   do { } while (0)
 411 #endif
 412
 413 #if DEBUG
 414 /* Magic nums for obj red zoning.
 415  * Placed in the first word before and the first word after an obj.
 416  */
 417 #define RED_INACTIVE    0x5A2CF071UL    /* when obj is inactive */
 418 #define RED_ACTIVE      0x170FC2A5UL    /* when obj is active */
 419
 420 /* ...and for poisoning */
 421 #define POISON_INUSE    0x5a    /* for use-uninitialised poisoning */
 422 #define POISON_FREE     0x6b    /* for use-after-free poisoning */
 423 #define POISON_END      0xa5    /* end-byte of poisoning */
 424
 425 /* memory layout of objects:
 426  * 0            : objp
 427  * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
 428  *              the end of an object is aligned with the end of the real
 429  *              allocation. Catches writes behind the end of the allocation.
 430  * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
 431  *              redzone word.
 432  * cachep->dbghead: The real object.
 433  * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 434  * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
 435  */
 436 static int obj_dbghead(kmem_cache_t *cachep)
 437 {
 438         return cachep->dbghead;
 439 }
 440
 441 static int obj_reallen(kmem_cache_t *cachep)
 442 {
 443         return cachep->reallen;
 444 }
 445
 446 static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
 447 {
 448         BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 449         return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
 450 }
 451
 452 static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
 453 {
 454         BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 455         if (cachep->flags & SLAB_STORE_USER)
 456                 return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD);
 457         return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD);
 458 }
 459
 460 static void **dbg_userword(kmem_cache_t *cachep, void *objp)
 461 {
 462         BUG_ON(!(cachep->flags & SLAB_STORE_USER));
 463         return (void**)(objp+cachep->objsize-BYTES_PER_WORD);
 464 }
 465
 466 #else
 467
 468 #define obj_dbghead(x)                  0
 469 #define obj_reallen(cachep)             (cachep->objsize)
 470 #define dbg_redzone1(cachep, objp)      ({BUG(); (unsigned long *)NULL;})
 471 #define dbg_redzone2(cachep, objp)      ({BUG(); (unsigned long *)NULL;})
 472 #define dbg_userword(cachep, objp)      ({BUG(); (void **)NULL;})
 473
 474 #endif
 475
 476 /*
 477  * Maximum size of an obj (in 2^order pages)
 478  * and absolute limit for the gfp order.
 479  */
 480 #if defined(CONFIG_LARGE_ALLOCS)
 481 #define MAX_OBJ_ORDER   13      /* up to 32Mb */
 482 #define MAX_GFP_ORDER   13      /* up to 32Mb */
 483 #elif defined(CONFIG_MMU)
 484 #define MAX_OBJ_ORDER   5       /* 32 pages */
 485 #define MAX_GFP_ORDER   5       /* 32 pages */
 486 #else
 487 #define MAX_OBJ_ORDER   8       /* up to 1Mb */
 488 #define MAX_GFP_ORDER   8       /* up to 1Mb */
 489 #endif
 490
 491 /*
 492  * Do not go above this order unless 0 objects fit into the slab.
 493  */
 494 #define BREAK_GFP_ORDER_HI      1
 495 #define BREAK_GFP_ORDER_LO      0
 496 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
 497
 498 /* Macros for storing/retrieving the cachep and or slab from the
 499  * global 'mem_map'. These are used to find the slab an obj belongs to.
 500  * With kfree(), these are used to find the cache which an obj belongs to.
 501  */
 502 #define SET_PAGE_CACHE(pg,x)  ((pg)->lru.next = (struct list_head *)(x))
 503 #define GET_PAGE_CACHE(pg)    ((kmem_cache_t *)(pg)->lru.next)
 504 #define SET_PAGE_SLAB(pg,x)   ((pg)->lru.prev = (struct list_head *)(x))
 505 #define GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->lru.prev)
 506
 507 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
 508 struct cache_sizes malloc_sizes[] = {
 509 #define CACHE(x) { .cs_size = (x) },
 510 #include <linux/kmalloc_sizes.h>
 511         CACHE(ULONG_MAX)
 512 #undef CACHE
 513 };
 514 EXPORT_SYMBOL(malloc_sizes);
 515
 516 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
 517 struct cache_names {
 518         char *name;
 519         char *name_dma;
 520 };
 521
 522 static struct cache_names __initdata cache_names[] = {
 523 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
 524 #include <linux/kmalloc_sizes.h>
 525         { NULL, }
 526 #undef CACHE
 527 };
 528
 529 static struct arraycache_init initarray_cache __initdata =
 530         { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 531 static struct arraycache_init initarray_generic =
 532         { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 533
 534 /* internal cache of cache description objs */
 535 static kmem_cache_t cache_cache = {
 536         .lists          = LIST3_INIT(cache_cache.lists),
 537         .batchcount     = 1,
 538         .limit          = BOOT_CPUCACHE_ENTRIES,
 539         .objsize        = sizeof(kmem_cache_t),
 540         .flags          = SLAB_NO_REAP,
 541         .spinlock       = SPIN_LOCK_UNLOCKED,
 542         .name           = "kmem_cache",
 543 #if DEBUG
 544         .reallen        = sizeof(kmem_cache_t),
 545 #endif
 546 };
 547
 548 /* Guard access to the cache-chain. */
 549 static struct semaphore cache_chain_sem;
 550 static struct list_head cache_chain;
 551
 552 /*
 553  * vm_enough_memory() looks at this to determine how many
 554  * slab-allocated pages are possibly freeable under pressure
 555  *
 556  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
 557  */
 558 atomic_t slab_reclaim_pages;
 559 EXPORT_SYMBOL(slab_reclaim_pages);
 560
 561 /*
 562  * chicken and egg problem: delay the per-cpu array allocation
 563  * until the general caches are up.
 564  */
 565 static enum {
 566         NONE,
 567         PARTIAL,
 568         FULL
 569 } g_cpucache_up;
 570
 571 static DEFINE_PER_CPU(struct work_struct, reap_work);
 572
 573 static void free_block(kmem_cache_t* cachep, void** objpp, int len);
 574 static void enable_cpucache (kmem_cache_t *cachep);
 575 static void cache_reap (void *unused);
 576
 577 static inline void **ac_entry(struct array_cache *ac)
 578 {
 579         return (void**)(ac+1);
 580 }
 581
 582 static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 583 {
 584         return cachep->array[smp_processor_id()];
 585 }
 586
 587 static inline kmem_cache_t *__find_general_cachep(size_t size,
 588                                                 unsigned int __nocast gfpflags)
 589 {
 590         struct cache_sizes *csizep = malloc_sizes;
 591
 592 #if DEBUG
 593         /* This happens if someone tries to call
 594         * kmem_cache_create(), or __kmalloc(), before
 595         * the generic caches are initialized.
 596         */
 597         BUG_ON(csizep->cs_cachep == NULL);
 598 #endif
 599         while (size > csizep->cs_size)
 600                 csizep++;
 601
 602         /*
 603          * Really subtile: The last entry with cs->cs_size==ULONG_MAX
 604          * has cs_{dma,}cachep==NULL. Thus no special case
 605          * for large kmalloc calls required.
 606          */
 607         if (unlikely(gfpflags & GFP_DMA))
 608                 return csizep->cs_dmacachep;
 609         return csizep->cs_cachep;
 610 }
 611
 612 kmem_cache_t *kmem_find_general_cachep(size_t size,
 613                 unsigned int __nocast gfpflags)
 614 {
 615         return __find_general_cachep(size, gfpflags);
 616 }
 617 EXPORT_SYMBOL(kmem_find_general_cachep);
 618
 619 /* Cal the num objs, wastage, and bytes left over for a given slab size. */
 620 static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
 621                  int flags, size_t *left_over, unsigned int *num)
 622 {
 623         int i;
 624         size_t wastage = PAGE_SIZE<<gfporder;
 625         size_t extra = 0;
 626         size_t base = 0;
 627
 628         if (!(flags & CFLGS_OFF_SLAB)) {
 629                 base = sizeof(struct slab);
 630                 extra = sizeof(kmem_bufctl_t);
 631         }
 632         i = 0;
 633         while (i*size + ALIGN(base+i*extra, align) <= wastage)
 634                 i++;
 635         if (i > 0)
 636                 i--;
 637
 638         if (i > SLAB_LIMIT)
 639                 i = SLAB_LIMIT;
 640
 641         *num = i;
 642         wastage -= i*size;
 643         wastage -= ALIGN(base+i*extra, align);
 644         *left_over = wastage;
 645 }
 646
 647 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
 648
 649 static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
 650 {
 651         printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 652                 function, cachep->name, msg);
 653         dump_stack();
 654 }
 655
 656 /*
 657  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 658  * via the workqueue/eventd.
 659  * Add the CPU number into the expiration time to minimize the possibility of
 660  * the CPUs getting into lockstep and contending for the global cache chain
 661  * lock.
 662  */
 663 static void __devinit start_cpu_timer(int cpu)
 664 {
 665         struct work_struct *reap_work = &per_cpu(reap_work, cpu);
 666
 667         /*
 668          * When this gets called from do_initcalls via cpucache_init(),
 669          * init_workqueues() has already run, so keventd will be setup
 670          * at that time.
 671          */
 672         if (keventd_up() && reap_work->func == NULL) {
 673                 INIT_WORK(reap_work, cache_reap, NULL);
 674                 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 675         }
 676 }
 677
 678 static struct array_cache *alloc_arraycache(int cpu, int entries,
 679                                                 int batchcount)
 680 {
 681         int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
 682         struct array_cache *nc = NULL;
 683
 684         if (cpu == -1)
 685                 nc = kmalloc(memsize, GFP_KERNEL);
 686         else
 687                 nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
 688
 689         if (nc) {
 690                 nc->avail = 0;
 691                 nc->limit = entries;
 692                 nc->batchcount = batchcount;
 693                 nc->touched = 0;
 694         }
 695         return nc;
 696 }
 697
 698 static int __devinit cpuup_callback(struct notifier_block *nfb,
 699                                   unsigned long action, void *hcpu)
 700 {
 701         long cpu = (long)hcpu;
 702         kmem_cache_t* cachep;
 703
 704         switch (action) {
 705         case CPU_UP_PREPARE:
 706                 down(&cache_chain_sem);
 707                 list_for_each_entry(cachep, &cache_chain, next) {
 708                         struct array_cache *nc;
 709
 710                         nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
 711                         if (!nc)
 712                                 goto bad;
 713
 714                         spin_lock_irq(&cachep->spinlock);
 715                         cachep->array[cpu] = nc;
 716                         cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
 717                                                 + cachep->num;
 718                         spin_unlock_irq(&cachep->spinlock);
 719
 720                 }
 721                 up(&cache_chain_sem);
 722                 break;
 723         case CPU_ONLINE:
 724                 start_cpu_timer(cpu);
 725                 break;
 726 #ifdef CONFIG_HOTPLUG_CPU
 727         case CPU_DEAD:
 728                 /* fall thru */
 729         case CPU_UP_CANCELED:
 730                 down(&cache_chain_sem);
 731
 732                 list_for_each_entry(cachep, &cache_chain, next) {
 733                         struct array_cache *nc;
 734
 735                         spin_lock_irq(&cachep->spinlock);
 736                         /* cpu is dead; no one can alloc from it. */
 737                         nc = cachep->array[cpu];
 738                         cachep->array[cpu] = NULL;
 739                         cachep->free_limit -= cachep->batchcount;
 740                         free_block(cachep, ac_entry(nc), nc->avail);
 741                         spin_unlock_irq(&cachep->spinlock);
 742                         kfree(nc);
 743                 }
 744                 up(&cache_chain_sem);
 745                 break;
 746 #endif
 747         }
 748         return NOTIFY_OK;
 749 bad:
 750         up(&cache_chain_sem);
 751         return NOTIFY_BAD;
 752 }
 753
 754 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 755
 756 /* Initialisation.
 757  * Called after the gfp() functions have been enabled, and before smp_init().
 758  */
 759 void __init kmem_cache_init(void)
 760 {
 761         size_t left_over;
 762         struct cache_sizes *sizes;
 763         struct cache_names *names;
 764
 765         /*
 766          * Fragmentation resistance on low memory - only use bigger
 767          * page orders on machines with more than 32MB of memory.
 768          */
 769         if (num_physpages > (32 << 20) >> PAGE_SHIFT)
 770                 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
 771
 772
 773         /* Bootstrap is tricky, because several objects are allocated
 774          * from caches that do not exist yet:
 775          * 1) initialize the cache_cache cache: it contains the kmem_cache_t
 776          *    structures of all caches, except cache_cache itself: cache_cache
 777          *    is statically allocated.
 778          *    Initially an __init data area is used for the head array, it's
 779          *    replaced with a kmalloc allocated array at the end of the bootstrap.
 780          * 2) Create the first kmalloc cache.
 781          *    The kmem_cache_t for the new cache is allocated normally. An __init
 782          *    data area is used for the head array.
 783          * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
 784          * 4) Replace the __init data head arrays for cache_cache and the first
 785          *    kmalloc cache with kmalloc allocated arrays.
 786          * 5) Resize the head arrays of the kmalloc caches to their final sizes.
 787          */
 788
 789         /* 1) create the cache_cache */
 790         init_MUTEX(&cache_chain_sem);
 791         INIT_LIST_HEAD(&cache_chain);
 792         list_add(&cache_cache.next, &cache_chain);
 793         cache_cache.colour_off = cache_line_size();
 794         cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
 795
 796         cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
 797
 798         cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
 799                                 &left_over, &cache_cache.num);
 800         if (!cache_cache.num)
 801                 BUG();
 802
 803         cache_cache.colour = left_over/cache_cache.colour_off;
 804         cache_cache.colour_next = 0;
 805         cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
 806                                 sizeof(struct slab), cache_line_size());
 807
 808         /* 2+3) create the kmalloc caches */
 809         sizes = malloc_sizes;
 810         names = cache_names;
 811
 812         while (sizes->cs_size != ULONG_MAX) {
 813                 /* For performance, all the general caches are L1 aligned.
 814                  * This should be particularly beneficial on SMP boxes, as it
 815                  * eliminates "false sharing".
 816                  * Note for systems short on memory removing the alignment will
 817                  * allow tighter packing of the smaller caches. */
 818                 sizes->cs_cachep = kmem_cache_create(names->name,
 819                         sizes->cs_size, ARCH_KMALLOC_MINALIGN,
 820                         (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
 821
 822                 /* Inc off-slab bufctl limit until the ceiling is hit. */
 823                 if (!(OFF_SLAB(sizes->cs_cachep))) {
 824                         offslab_limit = sizes->cs_size-sizeof(struct slab);
 825                         offslab_limit /= sizeof(kmem_bufctl_t);
 826                 }
 827
 828                 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
 829                         sizes->cs_size, ARCH_KMALLOC_MINALIGN,
 830                         (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
 831                         NULL, NULL);
 832
 833                 sizes++;
 834                 names++;
 835         }
 836         /* 4) Replace the bootstrap head arrays */
 837         {
 838                 void * ptr;
 839
 840                 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 841                 local_irq_disable();
 842                 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
 843                 memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
 844                 cache_cache.array[smp_processor_id()] = ptr;
 845                 local_irq_enable();
 846
 847                 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 848                 local_irq_disable();
 849                 BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
 850                 memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
 851                                 sizeof(struct arraycache_init));
 852                 malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
 853                 local_irq_enable();
 854         }
 855
 856         /* 5) resize the head arrays to their final sizes */
 857         {
 858                 kmem_cache_t *cachep;
 859                 down(&cache_chain_sem);
 860                 list_for_each_entry(cachep, &cache_chain, next)
 861                         enable_cpucache(cachep);
 862                 up(&cache_chain_sem);
 863         }
 864
 865         /* Done! */
 866         g_cpucache_up = FULL;
 867
 868         /* Register a cpu startup notifier callback
 869          * that initializes ac_data for all new cpus
 870          */
 871         register_cpu_notifier(&cpucache_notifier);
 872
 873
 874         /* The reap timers are started later, with a module init call:
 875          * That part of the kernel is not yet operational.
 876          */
 877 }
 878
 /*
  * cpucache_init - initcall that starts the per-CPU timers.
  *
  * Walks every CPU index below NR_CPUS and calls start_cpu_timer() for
  * each one that cpu_online() reports as online. Always returns 0.
  */
 879 static int __init cpucache_init(void)
 880 {
 881         int cpu;
 882
 883         /*
 884          * Register the timers that return unneeded
 885          * pages to gfp.
 886          */
 887         for (cpu = 0; cpu < NR_CPUS; cpu++) {
 888                 if (cpu_online(cpu))
 889                         start_cpu_timer(cpu);
 890         }
 891
 892         return 0;
 893 }
 894
 895 __initcall(cpucache_init);
 896
 897 /*
 898  * Interface to system's page allocator. No need to hold the cache-lock.
 899  *
 900  * If we requested dmaable memory, we will get it. Even if we
 901  * did not request dmaable memory, we might get it, but that
 902  * would be relatively rare and ignorable.
      *
      * @cachep: cache the pages are allocated for; its gfpflags are OR-ed
      *          into @flags and its gfporder selects the allocation order.
      * @flags:  allocation flags supplied by the caller.
      * @nodeid: -1 to use alloc_pages(); any other value is handed to
      *          alloc_pages_node() as the node argument.
      *
      * Returns page_address() of the first allocated page, or NULL if the
      * page allocator returned no page. On success each of the
      * (1 << gfporder) pages is marked with SetPageSlab(), nr_slab is
      * increased by that page count, and so is slab_reclaim_pages when the
      * cache has SLAB_RECLAIM_ACCOUNT set.
 903  */
 904 static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
 905 {
 906         struct page *page;
 907         void *addr;
 908         int i;
 909
 910         flags |= cachep->gfpflags;
 911         if (likely(nodeid == -1)) {
 912                 page = alloc_pages(flags, cachep->gfporder);
 913         } else {
 914                 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
 915         }
 916         if (!page)
 917                 return NULL;
 918         addr = page_address(page);
 919
             /* i is the page count for the accounting, then the loop counter. */
 920         i = (1 << cachep->gfporder);
 921         if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 922                 atomic_add(i, &slab_reclaim_pages);
 923         add_page_state(nr_slab, i);
 924         while (i--) {
 925                 SetPageSlab(page);
 926                 page++;
 927         }
 928         return addr;
 929 }
 930
 931 /*
 932  * Interface to system's page release.
      *
      * @cachep: cache that owns the slab; gfporder gives the block size.
      * @addr:   address of the first page of the block; it is passed to
      *          virt_to_page() and to free_pages().
      *
      * Clears the Slab flag on each of the (1 << gfporder) pages (BUG() if
      * one of them did not have it set), subtracts the pages from nr_slab,
      * adds them to current->reclaim_state->reclaimed_slab when the current
      * task has a reclaim_state, and releases the block with free_pages().
      * For SLAB_RECLAIM_ACCOUNT caches slab_reclaim_pages is reduced too.
 933  */
 934 static void kmem_freepages(kmem_cache_t *cachep, void *addr)
 935 {
 936         unsigned long i = (1<<cachep->gfporder);
 937         struct page *page = virt_to_page(addr);
             /* i is consumed by the loop below, so keep the page count here. */
 938         const unsigned long nr_freed = i;
 939
 940         while (i--) {
 941                 if (!TestClearPageSlab(page))
 942                         BUG();
 943                 page++;
 944         }
 945         sub_page_state(nr_slab, nr_freed);
 946         if (current->reclaim_state)
 947                 current->reclaim_state->reclaimed_slab += nr_freed;
 948         free_pages((unsigned long)addr, cachep->gfporder);
 949         if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 950                 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
 951 }
 952
 /*
  * kmem_rcu_free - RCU callback queued by slab_destroy() for
  * SLAB_DESTROY_BY_RCU caches.
  *
  * @head is reinterpreted as the struct slab_rcu that slab_destroy()
  * filled in. The recorded page block is released with kmem_freepages()
  * and, for off-slab caches, the descriptor memory is returned to
  * cachep->slabp_cache.
  *
  * NOTE(review): the cast assumes the rcu_head sits at the start of
  * struct slab_rcu - the struct definition is outside this section.
  */
 953 static void kmem_rcu_free(struct rcu_head *head)
 954 {
 955         struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
 956         kmem_cache_t *cachep = slab_rcu->cachep;
 957
 958         kmem_freepages(cachep, slab_rcu->addr);
 959         if (OFF_SLAB(cachep))
 960                 kmem_cache_free(cachep->slabp_cache, slab_rcu);
 961 }
 962
 963 #if DEBUG
 964
 965 #ifdef CONFIG_DEBUG_PAGEALLOC
 /*
  * store_stackinfo - record a small call trace inside an object's payload.
  * @cachep: cache the object belongs to.
  * @addr:   start of the object (debug head included).
  * @caller: value stored as the second word of the record.
  *
  * Layout written at addr + obj_dbghead(): 0x12345678, @caller, the
  * current CPU number, then every stack word (scanning upwards from
  * &caller until kstack_end()) that kernel_text_address() accepts, and
  * finally 0x87654321. Objects shorter than five words are left untouched.
  */
 966 static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
 967                                 unsigned long caller)
 968 {
 969         int size = obj_reallen(cachep);
 970
 971         addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];
 972
 973         if (size < 5*sizeof(unsigned long))
 974                 return;
 975
 976         *addr++=0x12345678;
 977         *addr++=caller;
 978         *addr++=smp_processor_id();
 979         size -= 3*sizeof(unsigned long);
 980         {
 981                 unsigned long *sptr = &caller;
 982                 unsigned long svalue;
 983
 984                 while (!kstack_end(sptr)) {
 985                         svalue = *sptr++;
 986                         if (kernel_text_address(svalue)) {
 987                                 *addr++=svalue;
 988                                 size -= sizeof(unsigned long);
                                     /* Keep one word free for the end marker. */
 989                                 if (size <= sizeof(unsigned long))
 990                                         break;
 991                         }
 992                 }
 993
 994         }
 995         *addr++=0x87654321;
 996 }
 997 #endif
 998
 /*
  * poison_obj - fill an object's payload with a poison byte.
  * @cachep: cache the object belongs to.
  * @addr:   start of the object (debug head included).
  * @val:    byte value to fill with.
  *
  * Writes @val over the obj_reallen() bytes that follow the debug head,
  * then overwrites the last of those bytes with POISON_END.
  */
 999 static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
1000 {
1001         int size = obj_reallen(cachep);
1002         addr = &((char*)addr)[obj_dbghead(cachep)];
1003
1004         memset(addr, val, size);
1005         *(unsigned char *)(addr+size-1) = POISON_END;
1006 }
1007
/*
 * dump_line - print one hexdump line.
 * @data:   base of the buffer.
 * @offset: offset of the first byte to print; also printed as the
 *          three-digit hex label that starts the line.
 * @limit:  number of bytes to print.
 */
1008 static void dump_line(char *data, int offset, int limit)
1009 {
1010         int i;
1011         printk(KERN_ERR "%03x:", offset);
1012         for (i=0;i<limit;i++) {
                     /* Cast so that bytes >= 0x80 are not sign-extended. */
1013                 printk(" %02x", (unsigned char)data[offset+i]);
1014         }
1015         printk("\n");
1016 }
1017 #endif
1018
1019 #if DEBUG
1020
/*
 * print_objinfo - print debug information about one object.
 * @cachep: cache the object belongs to.
 * @objp:   start of the object (debug head included).
 * @lines:  maximum number of 16-byte hexdump lines; 0 prints none.
 *
 * Prints the two redzone words for SLAB_RED_ZONE caches and the recorded
 * last user for SLAB_STORE_USER caches, followed by a hexdump of the
 * payload limited to @lines lines.
 */
1021 static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1022 {
1023         int i, size;
1024         char *realobj;
1025
1026         if (cachep->flags & SLAB_RED_ZONE) {
1027                 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1028                         *dbg_redzone1(cachep, objp),
1029                         *dbg_redzone2(cachep, objp));
1030         }
1031
1032         if (cachep->flags & SLAB_STORE_USER) {
1033                 printk(KERN_ERR "Last user: [<%p>]",
1034                                 *dbg_userword(cachep, objp));
1035                 print_symbol("(%s)",
1036                                 (unsigned long)*dbg_userword(cachep, objp));
1037                 printk("\n");
1038         }
1039         realobj = (char*)objp+obj_dbghead(cachep);
1040         size = obj_reallen(cachep);
1041         for (i=0; i<size && lines;i+=16, lines--) {
1042                 int limit;
                     /* The last line may be shorter than 16 bytes. */
1043                 limit = 16;
1044                 if (i+limit > size)
1045                         limit = size-i;
1046                 dump_line(realobj, i, limit);
1047         }
1048 }
1049
/*
 * check_poison_obj - verify that an object still holds the poison pattern.
 * @cachep: cache the object belongs to.
 * @objp:   start of the object (debug head included).
 *
 * Every payload byte must be POISON_FREE except the last one, which must
 * be POISON_END. On the first mismatch a "Slab corruption" header is
 * printed; each 16-byte line that contains a mismatch is hexdumped, and
 * scanning stops once more than five lines have been dumped. If anything
 * was reported, the previous and the next object of the same slab (when
 * they exist) are described with print_objinfo().
 */
1050 static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1051 {
1052         char *realobj;
1053         int size, i;
1054         int lines = 0;
1055
1056         realobj = (char*)objp+obj_dbghead(cachep);
1057         size = obj_reallen(cachep);
1058
1059         for (i=0;i<size;i++) {
1060                 char exp = POISON_FREE;
1061                 if (i == size-1)
1062                         exp = POISON_END;
1063                 if (realobj[i] != exp) {
1064                         int limit;
1065                         /* Mismatch ! */
1066                         /* Print header */
1067                         if (lines == 0) {
1068                                 printk(KERN_ERR "Slab corruption: start=%p, len=%d\n",
1069                                                 realobj, size);
1070                                 print_objinfo(cachep, objp, 0);
1071                         }
1072                         /* Hexdump the affected line */
1073                         i = (i/16)*16;
1074                         limit = 16;
1075                         if (i+limit > size)
1076                                 limit = size-i;
1077                         dump_line(realobj, i, limit);
                             /*
                              * Resume at the first byte of the next line: the
                              * for loop's i++ supplies the 16th step, so adding
                              * 16 here would leave that byte unchecked.
                              */
1078                         i += 15;
1079                         lines++;
1080                         /* Stop once more than 5 lines have been dumped */
1081                         if (lines > 5)
1082                                 break;
1083                 }
1084         }
1085         if (lines != 0) {
1086                 /* Print some data about the neighboring objects, if they
1087                  * exist:
1088                  */
1089                 struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
1090                 int objnr;
1091
                     /* Index of the object within its slab. */
1092                 objnr = (objp-slabp->s_mem)/cachep->objsize;
1093                 if (objnr) {
1094                         objp = slabp->s_mem+(objnr-1)*cachep->objsize;
1095                         realobj = (char*)objp+obj_dbghead(cachep);
1096                         printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1097                                                 realobj, size);
1098                         print_objinfo(cachep, objp, 2);
1099                 }
1100                 if (objnr+1 < cachep->num) {
1101                         objp = slabp->s_mem+(objnr+1)*cachep->objsize;
1102                         realobj = (char*)objp+obj_dbghead(cachep);
1103                         printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1104                                                 realobj, size);
1105                         print_objinfo(cachep, objp, 2);
1106                 }
1107         }
1108 }
1109 #endif
1110
1111 /* Destroy all the objs in a slab, and release the mem back to the system.
1112  * Before calling the slab must have been unlinked from the cache.
1113  * The cache-lock is not held/needed.
      *
      * @cachep: cache the slab belongs to.
      * @slabp:  slab descriptor.
      *
      * DEBUG builds check every object's poison pattern and red zones before
      * the destructor runs. For SLAB_DESTROY_BY_RCU caches the pages are not
      * freed here: the release is queued with call_rcu() and performed by
      * kmem_rcu_free().
1114  */
1115 static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1116 {
             /* Address later handed to kmem_freepages(): s_mem minus colouroff. */
1117         void *addr = slabp->s_mem - slabp->colouroff;
1118
1119 #if DEBUG
1120         int i;
1121         for (i = 0; i < cachep->num; i++) {
1122                 void *objp = slabp->s_mem + cachep->objsize * i;
1123
1124                 if (cachep->flags & SLAB_POISON) {
1125 #ifdef CONFIG_DEBUG_PAGEALLOC
                             /*
                              * Off-slab objects that are a whole number of pages
                              * are passed to kernel_map_pages(..., 1) instead of
                              * being poison-checked.
                              * NOTE(review): presumably they were unmapped while
                              * free - confirm against the free path.
                              */
1126                         if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
1127                                 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
1128                         else
1129                                 check_poison_obj(cachep, objp);
1130 #else
1131                         check_poison_obj(cachep, objp);
1132 #endif
1133                 }
1134                 if (cachep->flags & SLAB_RED_ZONE) {
1135                         if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1136                                 slab_error(cachep, "start of a freed object "
1137                                                         "was overwritten");
1138                         if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1139                                 slab_error(cachep, "end of a freed object "
1140                                                         "was overwritten");
1141                 }
                     /*
                      * The destructor gets the payload address (past the debug
                      * head) and is skipped for poisoned caches.
                      */
1142                 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1143                         (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0);
1144         }
1145 #else
1146         if (cachep->dtor) {
1147                 int i;
1148                 for (i = 0; i < cachep->num; i++) {
1149                         void* objp = slabp->s_mem+cachep->objsize*i;
1150                         (cachep->dtor)(objp, cachep, 0);
1151                 }
1152         }
1153 #endif
1154
1155         if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1156                 struct slab_rcu *slab_rcu;
1157
                     /*
                      * Reuse the slab descriptor's memory as a struct slab_rcu
                      * and let kmem_rcu_free() do the actual freeing.
                      */
1158                 slab_rcu = (struct slab_rcu *) slabp;
1159                 slab_rcu->cachep = cachep;
1160                 slab_rcu->addr = addr;
1161                 call_rcu(&slab_rcu->head, kmem_rcu_free);
1162         } else {
1163                 kmem_freepages(cachep, addr);
1164                 if (OFF_SLAB(cachep))
1165                         kmem_cache_free(cachep->slabp_cache, slabp);
1166         }
1167 }
1168
1169 /**
1170  * kmem_cache_create - Create a cache.
1171  * @name: A string which is used in /proc/slabinfo to identify this cache.
1172  * @size: The size of objects to be created in this cache.
1173  * @align: The required alignment for the objects.
1174  * @flags: SLAB flags
1175  * @ctor: A constructor for the objects.
1176  * @dtor: A destructor for the objects.
1177  *
1178  * Returns a ptr to the cache on success, NULL on failure.
      * If %SLAB_PANIC is set in @flags, a failure panics instead of
      * returning NULL.
1179  * Cannot be called from interrupt context, but can be interrupted.
1180  * The @ctor is run when new pages are allocated by the cache
1181  * and the @dtor is run before the pages are handed back.
1182  *
1183  * @name must be valid until the cache is destroyed. This implies that
1184  * the module calling this has to destroy the cache before getting
1185  * unloaded.
1186  *
1187  * The flags are
1188  *
1189  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
1190  * to catch references to uninitialised memory.
1191  *
1192  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1193  * for buffer overruns.
1194  *
1195  * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1196  * memory pressure.
1197  *
1198  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1199  * cacheline.  This can be beneficial if you're counting cycles as closely
1200  * as davem.
1201  */
1202 kmem_cache_t *
1203 kmem_cache_create (const char *name, size_t size, size_t align,
1204         unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
1205         void (*dtor)(void*, kmem_cache_t *, unsigned long))
1206 {
1207         size_t left_over, slab_size, ralign;
1208         kmem_cache_t *cachep = NULL;
1209
1210         /*
1211          * Sanity checks... these are all serious usage bugs.
1212          */
1213         if ((!name) ||
1214                 in_interrupt() ||
1215                 (size < BYTES_PER_WORD) ||
1216                 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
1217                 (dtor && !ctor)) {
1218                         printk(KERN_ERR "%s: Early error in slab %s\n",
1219                                         __FUNCTION__, name);
1220                         BUG();
1221                 }
1222
1223 #if DEBUG
1224         WARN_ON(strchr(name, ' '));     /* It confuses parsers */
1225         if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1226                 /* No constructor, but initial state check requested */
1227                 printk(KERN_ERR "%s: No con, but init state check "
1228                                 "requested - %s\n", __FUNCTION__, name);
1229                 flags &= ~SLAB_DEBUG_INITIAL;
1230         }
1231
1232 #if FORCED_DEBUG
1233         /*
1234          * Enable redzoning and last user accounting, except for caches with
1235          * large objects, if the increased size would increase the object size
1236          * above the next power of two: caches with object sizes just above a
1237          * power of two have a significant amount of internal fragmentation.
1238          */
1239         if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
1240                 flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
1241         if (!(flags & SLAB_DESTROY_BY_RCU))
1242                 flags |= SLAB_POISON;
1243 #endif
1244         if (flags & SLAB_DESTROY_BY_RCU)
1245                 BUG_ON(flags & SLAB_POISON);
1246 #endif
             /* SLAB_DESTROY_BY_RCU caches must not have a destructor. */
1247         if (flags & SLAB_DESTROY_BY_RCU)
1248                 BUG_ON(dtor);
1249
1250         /*
1251          * Always checks flags, a caller might be expecting debug
1252          * support which isn't available.
1253          */
1254         if (flags & ~CREATE_MASK)
1255                 BUG();
1256
1257         /* Check that size is in terms of words.  This is needed to avoid
1258          * unaligned accesses for some archs when redzoning is used, and makes
1259          * sure any on-slab bufctl's are also correctly aligned.
1260          */
1261         if (size & (BYTES_PER_WORD-1)) {
1262                 size += (BYTES_PER_WORD-1);
1263                 size &= ~(BYTES_PER_WORD-1);
1264         }
1265
1266         /* calculate out the final buffer alignment: */
1267         /* 1) arch recommendation: can be overridden for debug */
1268         if (flags & SLAB_HWCACHE_ALIGN) {
1269                 /* Default alignment: as specified by the arch code.
1270                  * Except if an object is really small, then squeeze multiple
1271                  * objects into one cacheline.
1272                  */
1273                 ralign = cache_line_size();
1274                 while (size <= ralign/2)
1275                         ralign /= 2;
1276         } else {
1277                 ralign = BYTES_PER_WORD;
1278         }
1279         /* 2) arch mandated alignment: disables debug if necessary */
1280         if (ralign < ARCH_SLAB_MINALIGN) {
1281                 ralign = ARCH_SLAB_MINALIGN;
1282                 if (ralign > BYTES_PER_WORD)
1283                         flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
1284         }
1285         /* 3) caller mandated alignment: disables debug if necessary */
1286         if (ralign < align) {
1287                 ralign = align;
1288                 if (ralign > BYTES_PER_WORD)
1289                         flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
1290         }
1291         /* 4) Store it. Note that the debug code below can reduce
1292          *    the alignment to BYTES_PER_WORD.
1293          */
1294         align = ralign;
1295
1296         /* Get cache's description obj. */
1297         cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1298         if (!cachep)
1299                 goto opps;
1300         memset(cachep, 0, sizeof(kmem_cache_t));
1301
1302 #if DEBUG
             /* Remember the caller's object size before debug fields are added. */
1303         cachep->reallen = size;
1304
1305         if (flags & SLAB_RED_ZONE) {
1306                 /* redzoning only works with word aligned caches */
1307                 align = BYTES_PER_WORD;
1308
1309                 /* add space for red zone words */
1310                 cachep->dbghead += BYTES_PER_WORD;
1311                 size += 2*BYTES_PER_WORD;
1312         }
1313         if (flags & SLAB_STORE_USER) {
1314                 /* user store requires word alignment and
1315                  * one word storage behind the end of the real
1316                  * object.
1317                  */
1318                 align = BYTES_PER_WORD;
1319                 size += BYTES_PER_WORD;
1320         }
1321 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
             /*
              * Grow such objects to a full page, putting the padding in front
              * of the payload (it is added to dbghead).
              */
1322         if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1323                 cachep->dbghead += PAGE_SIZE - size;
1324                 size = PAGE_SIZE;
1325         }
1326 #endif
1327 #endif
1328
1329         /* Determine if the slab management is 'on' or 'off' slab. */
1330         if (size >= (PAGE_SIZE>>3))
1331                 /*
1332                  * Size is large, assume best to place the slab management obj
1333                  * off-slab (should allow better packing of objs).
1334                  */
1335                 flags |= CFLGS_OFF_SLAB;
1336
1337         size = ALIGN(size, align);
1338
1339         if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
1340                 /*
1341                  * A VFS-reclaimable slab tends to have most allocations
1342                  * as GFP_NOFS and we really don't want to have to be allocating
1343                  * higher-order pages when we are unable to shrink dcache.
1344                  */
1345                 cachep->gfporder = 0;
1346                 cache_estimate(cachep->gfporder, size, align, flags,
1347                                         &left_over, &cachep->num);
1348         } else {
1349                 /*
1350                  * Calculate size (in pages) of slabs, and the num of objs per
1351                  * slab.  This could be made much more intelligent.  For now,
1352                  * try to avoid using high page-orders for slabs.  When the
1353                  * gfp() funcs are more friendly towards high-order requests,
1354                  * this should be changed.
1355                  */
1356                 do {
                             /*
                              * Set after the order has been stepped back down:
                              * the estimate is redone once for the reduced order
                              * and the search then stops.
                              */
1357                         unsigned int break_flag = 0;
1358 cal_wastage:
1359                         cache_estimate(cachep->gfporder, size, align, flags,
1360                                                 &left_over, &cachep->num);
1361                         if (break_flag)
1362                                 break;
1363                         if (cachep->gfporder >= MAX_GFP_ORDER)
1364                                 break;
                             /* Not even one object fits: try the next order. */
1365                         if (!cachep->num)
1366                                 goto next;
1367                         if (flags & CFLGS_OFF_SLAB &&
1368                                         cachep->num > offslab_limit) {
1369                                 /* This num of objs will cause problems. */
1370                                 cachep->gfporder--;
1371                                 break_flag++;
1372                                 goto cal_wastage;
1373                         }
1374
1375                         /*
1376                          * Large num of objs is good, but v. large slabs are
1377                          * currently bad for the gfp()s.
1378                          */
1379                         if (cachep->gfporder >= slab_break_gfp_order)
1380                                 break;
1381
1382                         if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
1383                                 break;  /* Acceptable internal fragmentation. */
1384 next:
1385                         cachep->gfporder++;
1386                 } while (1);
1387         }
1388
1389         if (!cachep->num) {
1390                 printk("kmem_cache_create: couldn't create cache %s.\n", name);
1391                 kmem_cache_free(&cache_cache, cachep);
1392                 cachep = NULL;
1393                 goto opps;
1394         }
1395         slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
1396                                 + sizeof(struct slab), align);
1397
1398         /*
1399          * If the slab has been placed off-slab, and we have enough space then
1400          * move it on-slab. This is at the expense of any extra colouring.
1401          */
1402         if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
1403                 flags &= ~CFLGS_OFF_SLAB;
1404                 left_over -= slab_size;
1405         }
1406
1407         if (flags & CFLGS_OFF_SLAB) {
1408                 /* really off slab. No need for manual alignment */
1409                 slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
1410         }
1411
1412         cachep->colour_off = cache_line_size();
1413         /* Offset must be a multiple of the alignment. */
1414         if (cachep->colour_off < align)
1415                 cachep->colour_off = align;
1416         cachep->colour = left_over/cachep->colour_off;
1417         cachep->slab_size = slab_size;
1418         cachep->flags = flags;
1419         cachep->gfpflags = 0;
1420         if (flags & SLAB_CACHE_DMA)
1421                 cachep->gfpflags |= GFP_DMA;
1422         spin_lock_init(&cachep->spinlock);
1423         cachep->objsize = size;
1424         /* NUMA */
1425         INIT_LIST_HEAD(&cachep->lists.slabs_full);
1426         INIT_LIST_HEAD(&cachep->lists.slabs_partial);
1427         INIT_LIST_HEAD(&cachep->lists.slabs_free);
1428
             /* Off-slab descriptors are allocated from a general cache. */
1429         if (flags & CFLGS_OFF_SLAB)
1430                 cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
1431         cachep->ctor = ctor;
1432         cachep->dtor = dtor;
1433         cachep->name = name;
1434
1435         /* Don't let CPUs come and go */
1436         lock_cpu_hotplug();
1437
1438         if (g_cpucache_up == FULL) {
1439                 enable_cpucache(cachep);
1440         } else {
                     /*
                      * Bootstrap: give the current CPU a head array of
                      * BOOT_CPUCACHE_ENTRIES entries by hand.
                      */
1441                 if (g_cpucache_up == NONE) {
1442                         /* Note: the first kmem_cache_create must create
1443                          * the cache that's used by kmalloc(24), otherwise
1444                          * the creation of further caches will BUG().
1445                          */
1446                         cachep->array[smp_processor_id()] = &initarray_generic.cache;
1447                         g_cpucache_up = PARTIAL;
1448                 } else {
1449                         cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
1450                 }
1451                 BUG_ON(!ac_data(cachep));
1452                 ac_data(cachep)->avail = 0;
1453                 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1454                 ac_data(cachep)->batchcount = 1;
1455                 ac_data(cachep)->touched = 0;
1456                 cachep->batchcount = 1;
1457                 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1458                 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
1459                                         + cachep->num;
1460         }
1461
1462         cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
1463                                         ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1464
1465         /* Need the semaphore to access the chain. */
1466         down(&cache_chain_sem);
1467         {
1468                 struct list_head *p;
1469                 mm_segment_t old_fs;
1470
                     /*
                      * Switch to KERNEL_DS so that __get_user() below can be
                      * used to probe pc->name, which may no longer be readable.
                      */
1471                 old_fs = get_fs();
1472                 set_fs(KERNEL_DS);
1473                 list_for_each(p, &cache_chain) {
1474                         kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
1475                         char tmp;
1476                         /* This happens when the module gets unloaded and doesn't
1477                            destroy its slab cache and no one else reuses the vmalloc
1478                            area of the module. Print a warning. */
1479                         if (__get_user(tmp,pc->name)) {
1480                                 printk("SLAB: cache with size %d has lost its name\n",
1481                                         pc->objsize);
1482                                 continue;
1483                         }
                             /* A second cache with the same name is a fatal bug. */
1484                         if (!strcmp(pc->name,name)) {
1485                                 printk("kmem_cache_create: duplicate cache %s\n",name);
1486                                 up(&cache_chain_sem);
1487                                 unlock_cpu_hotplug();
1488                                 BUG();
1489                         }
1490                 }
1491                 set_fs(old_fs);
1492         }
1493
1494         /* cache setup completed, link it into the list */
1495         list_add(&cachep->next, &cache_chain);
1496         up(&cache_chain_sem);
1497         unlock_cpu_hotplug();
     /* Common exit: a NULL cachep is fatal when SLAB_PANIC was requested. */
1498 opps:
1499         if (!cachep && (flags & SLAB_PANIC))
1500                 panic("kmem_cache_create(): failed to create slab `%s'\n",
1501                         name);
1502         return cachep;
1503 }
1504 EXPORT_SYMBOL(kmem_cache_create);
1505
1506 #if DEBUG
     /* BUG() unless irqs_disabled() reports that interrupts are off. */
1507 static void check_irq_off(void)
1508 {
1509         BUG_ON(!irqs_disabled());
1510 }
1511
     /* BUG() if irqs_disabled() reports that interrupts are off. */
1512 static void check_irq_on(void)
1513 {
1514         BUG_ON(irqs_disabled());
1515 }
1516
     /*
      * On CONFIG_SMP builds: BUG() unless interrupts are off and
      * cachep->spinlock is already locked (a trylock on it must fail).
      * Without CONFIG_SMP the function does nothing.
      */
1517 static void check_spinlock_acquired(kmem_cache_t *cachep)
1518 {
1519 #ifdef CONFIG_SMP
1520         check_irq_off();
1521         BUG_ON(spin_trylock(&cachep->spinlock));
1522 #endif
1523 }
1524 #else
     /* Without DEBUG the three checks expand to empty statements. */
1525 #define check_irq_off() do { } while(0)
1526 #define check_irq_on()  do { } while(0)
1527 #define check_spinlock_acquired(x) do { } while(0)
1528 #endif
1529
1530 /*
1531  * Waits for all CPUs to execute func().
      *
      * @func: function to run; it is called with @arg.
      * @arg:  opaque argument passed through to @func.
      *
      * Must be entered with interrupts enabled (check_irq_on()). @func runs
      * first on the calling CPU with local interrupts disabled and is then
      * passed to smp_call_function(); a non-zero return from that call is
      * treated as a BUG(). Preemption is disabled for the whole sequence.
1532  */
1533 static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
1534 {
1535         check_irq_on();
1536         preempt_disable();
1537
1538         local_irq_disable();
1539         func(arg);
1540         local_irq_enable();
1541
1542         if (smp_call_function(func, arg, 1, 1))
1543                 BUG();
1544
1545         preempt_enable();
1546 }
1547
/*
 * Forward declaration: drain_cpu_caches() below calls this function, and
 * its definition is not in this part of the file.
 */
1548 static void drain_array_locked(kmem_cache_t* cachep,
1549                                 struct array_cache *ac, int force);
1550
/*
 * do_drain - empty the executing CPU's array cache of one cache.
 * @arg: the kmem_cache_t to drain, passed as void * because the function
 *       is handed to smp_call_function_all_cpus().
 *
 * Must run with interrupts off (check_irq_off()). All ac->avail entries
 * are given to free_block() under cachep->spinlock, after which the array
 * is marked empty.
 */
1551 static void do_drain(void *arg)
1552 {
1553         kmem_cache_t *cachep = (kmem_cache_t*)arg;
1554         struct array_cache *ac;
1555
1556         check_irq_off();
1557         ac = ac_data(cachep);
1558         spin_lock(&cachep->spinlock);
1559         free_block(cachep, &ac_entry(ac)[0], ac->avail);
1560         spin_unlock(&cachep->spinlock);
1561         ac->avail = 0;
1562 }
1563
/*
 * drain_cpu_caches - flush the array caches of a cache.
 * @cachep: cache to drain.
 *
 * Runs do_drain() through smp_call_function_all_cpus(), then - with
 * cachep->spinlock held and interrupts off - drains the shared array
 * (cachep->lists.shared), if there is one, with force set to 1.
 */
1564 static void drain_cpu_caches(kmem_cache_t *cachep)
1565 {
1566         smp_call_function_all_cpus(do_drain, cachep);
1567         check_irq_on();
1568         spin_lock_irq(&cachep->spinlock);
1569         if (cachep->lists.shared)
1570                 drain_array_locked(cachep, cachep->lists.shared, 1);
1571         spin_unlock_irq(&cachep->spinlock);
1572 }
1573
1574
1575 /* NUMA shrink all list3s */
     /*
      * __cache_shrink - destroy every slab on the cache's slabs_free list.
      * @cachep: cache to shrink.
      *
      * Drains the array caches first, then repeatedly unlinks the last slab
      * of slabs_free and destroys it. Returns non-zero if the full or the
      * partial list still holds slabs afterwards, 0 otherwise.
      */
1576 static int __cache_shrink(kmem_cache_t *cachep)
1577 {
1578         struct slab *slabp;
1579         int ret;
1580
1581         drain_cpu_caches(cachep);
1582
1583         check_irq_on();
1584         spin_lock_irq(&cachep->spinlock);
1585
1586         for(;;) {
1587                 struct list_head *p;
1588
                     /* The list is empty once its tail is the list head itself. */
1589                 p = cachep->lists.slabs_free.prev;
1590                 if (p == &cachep->lists.slabs_free)
1591                         break;
1592
1593                 slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
1594 #if DEBUG
1595                 if (slabp->inuse)
1596                         BUG();
1597 #endif
1598                 list_del(&slabp->list);
1599
1600                 cachep->lists.free_objects -= cachep->num;
                     /* slab_destroy() is called with the cache lock dropped. */
1601                 spin_unlock_irq(&cachep->spinlock);
1602                 slab_destroy(cachep, slabp);
1603                 spin_lock_irq(&cachep->spinlock);
1604         }
1605         ret = !list_empty(&cachep->lists.slabs_full) ||
1606                 !list_empty(&cachep->lists.slabs_partial);
1607         spin_unlock_irq(&cachep->spinlock);
1608         return ret;
1609 }
1610
1611 /**
1612  * kmem_cache_shrink - Shrink a cache.
1613  * @cachep: The cache to shrink.
1614  *
1615  * Releases as many slabs as possible for a cache.
1616  * To help debugging, a zero exit status indicates all slabs were released.
1617  */
1618 int kmem_cache_shrink(kmem_cache_t *cachep)
1619 {
1620         if (!cachep || in_interrupt())
1621                 BUG();
1622
1623         return __cache_shrink(cachep);
1624 }
1625 EXPORT_SYMBOL(kmem_cache_shrink);
1626
1627 /**
1628  * kmem_cache_destroy - delete a cache
1629  * @cachep: the cache to destroy
1630  *
1631  * Remove a kmem_cache_t object from the slab cache.
1632  * Returns 0 on success.
1633  *
1634  * It is expected this function will be called by a module when it is
1635  * unloaded.  This will remove the cache completely, and avoid a duplicate
1636  * cache being allocated each time a module is loaded and unloaded, if the
1637  * module doesn't have persistent in-kernel storage across loads and unloads.
1638  *
1639  * The cache must be empty before calling this function.
1640  *
1641  * The caller must guarantee that noone will allocate memory from the cache
1642  * during the kmem_cache_destroy().
1643  */
1644 int kmem_cache_destroy(kmem_cache_t * cachep)
1645 {
1646         int i;
1647
1648         if (!cachep || in_interrupt())
1649                 BUG();
1650
1651         /* Don't let CPUs to come and go */
1652         lock_cpu_hotplug();
1653
1654         /* Find the cache in the chain of caches. */
1655         down(&cache_chain_sem);
1656         /*
1657          * the chain is never empty, cache_cache is never destroyed
1658          */
1659         list_del(&cachep->next);
1660         up(&cache_chain_sem);
1661
1662         if (__cache_shrink(cachep)) {
1663                 slab_error(cachep, "Can't free all objects");
1664                 down(&cache_chain_sem);
1665                 list_add(&cachep->next,&cache_chain);
1666                 up(&cache_chain_sem);
1667                 unlock_cpu_hotplug();
1668                 return 1;
1669         }
1670
1671         if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
1672                 synchronize_rcu();
1673
1674         /* no cpu_online check required here since we clear the percpu
1675          * array on cpu offline and set this to NULL.
1676          */
1677         for (i = 0; i < NR_CPUS; i++)
1678                 kfree(cachep->array[i]);
1679
1680         /* NUMA: free the list3 structures */
1681         kfree(cachep->lists.shared);
1682         cachep->lists.shared = NULL;
1683         kmem_cache_free(&cache_cache, cachep);
1684
1685         unlock_cpu_hotplug();
1686
1687         return 0;
1688 }
1689 EXPORT_SYMBOL(kmem_cache_destroy);
1690
1691 /* Get the memory for a slab management obj. */
1692 static struct slab* alloc_slabmgmt(kmem_cache_t *cachep,
1693                         void *objp, int colour_off, unsigned int __nocast local_flags)
1694 {
1695         struct slab *slabp;
1696
1697         if (OFF_SLAB(cachep)) {
1698                 /* Slab management obj is off-slab. */
1699                 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
1700                 if (!slabp)
1701                         return NULL;
1702         } else {
1703                 slabp = objp+colour_off;
1704                 colour_off += cachep->slab_size;
1705         }
1706         slabp->inuse = 0;
1707         slabp->colouroff = colour_off;
1708         slabp->s_mem = objp+colour_off;
1709
1710         return slabp;
1711 }
1712
1713 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
1714 {
1715         return (kmem_bufctl_t *)(slabp+1);
1716 }
1717
1718 static void cache_init_objs(kmem_cache_t *cachep,
1719                         struct slab *slabp, unsigned long ctor_flags)
1720 {
1721         int i;
1722
1723         for (i = 0; i < cachep->num; i++) {
1724                 void* objp = slabp->s_mem+cachep->objsize*i;
1725 #if DEBUG
1726                 /* need to poison the objs? */
1727                 if (cachep->flags & SLAB_POISON)
1728                         poison_obj(cachep, objp, POISON_FREE);
1729                 if (cachep->flags & SLAB_STORE_USER)
1730                         *dbg_userword(cachep, objp) = NULL;
1731
1732                 if (cachep->flags & SLAB_RED_ZONE) {
1733                         *dbg_redzone1(cachep, objp) = RED_INACTIVE;
1734                         *dbg_redzone2(cachep, objp) = RED_INACTIVE;
1735                 }
1736                 /*
1737                  * Constructors are not allowed to allocate memory from
1738                  * the same cache which they are a constructor for.
1739                  * Otherwise, deadlock. They must also be threaded.
1740                  */
1741                 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
1742                         cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags);
1743
1744                 if (cachep->flags & SLAB_RED_ZONE) {
1745                         if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1746                                 slab_error(cachep, "constructor overwrote the"
1747                                                         " end of an object");
1748                         if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1749                                 slab_error(cachep, "constructor overwrote the"
1750                                                         " start of an object");
1751                 }
1752                 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
1753                         kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
1754 #else
1755                 if (cachep->ctor)
1756                         cachep->ctor(objp, cachep, ctor_flags);
1757 #endif
1758                 slab_bufctl(slabp)[i] = i+1;
1759         }
1760         slab_bufctl(slabp)[i-1] = BUFCTL_END;
1761         slabp->free = 0;
1762 }
1763
1764 static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags)
1765 {
1766         if (flags & SLAB_DMA) {
1767                 if (!(cachep->gfpflags & GFP_DMA))
1768                         BUG();
1769         } else {
1770                 if (cachep->gfpflags & GFP_DMA)
1771                         BUG();
1772         }
1773 }
1774
1775 static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
1776 {
1777         int i;
1778         struct page *page;
1779
1780         /* Nasty!!!!!! I hope this is OK. */
1781         i = 1 << cachep->gfporder;
1782         page = virt_to_page(objp);
1783         do {
1784                 SET_PAGE_CACHE(page, cachep);
1785                 SET_PAGE_SLAB(page, slabp);
1786                 page++;
1787         } while (--i);
1788 }
1789
1790 /*
1791  * Grow (by 1) the number of slabs within a cache.  This is called by
1792  * kmem_cache_alloc() when there are no active objs left in a cache.
1793  */
1794 static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
1795 {
1796         struct slab     *slabp;
1797         void            *objp;
1798         size_t           offset;
1799         unsigned int     local_flags;
1800         unsigned long    ctor_flags;
1801
1802         /* Be lazy and only check for valid flags here,
1803          * keeping it out of the critical path in kmem_cache_alloc().
1804          */
1805         if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
1806                 BUG();
1807         if (flags & SLAB_NO_GROW)
1808                 return 0;
1809
1810         ctor_flags = SLAB_CTOR_CONSTRUCTOR;
1811         local_flags = (flags & SLAB_LEVEL_MASK);
1812         if (!(local_flags & __GFP_WAIT))
1813                 /*
1814                  * Not allowed to sleep.  Need to tell a constructor about
1815                  * this - it might need to know...
1816                  */
1817                 ctor_flags |= SLAB_CTOR_ATOMIC;
1818
1819         /* About to mess with non-constant members - lock. */
1820         check_irq_off();
1821         spin_lock(&cachep->spinlock);
1822
1823         /* Get colour for the slab, and cal the next value. */
1824         offset = cachep->colour_next;
1825         cachep->colour_next++;
1826         if (cachep->colour_next >= cachep->colour)
1827                 cachep->colour_next = 0;
1828         offset *= cachep->colour_off;
1829
1830         spin_unlock(&cachep->spinlock);
1831
1832         if (local_flags & __GFP_WAIT)
1833                 local_irq_enable();
1834
1835         /*
1836          * The test for missing atomic flag is performed here, rather than
1837          * the more obvious place, simply to reduce the critical path length
1838          * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
1839          * will eventually be caught here (where it matters).
1840          */
1841         kmem_flagcheck(cachep, flags);
1842
1843
1844         /* Get mem for the objs. */
1845         if (!(objp = kmem_getpages(cachep, flags, nodeid)))
1846                 goto failed;
1847
1848         /* Get slab management. */
1849         if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
1850                 goto opps1;
1851
1852         set_slab_attr(cachep, slabp, objp);
1853
1854         cache_init_objs(cachep, slabp, ctor_flags);
1855
1856         if (local_flags & __GFP_WAIT)
1857                 local_irq_disable();
1858         check_irq_off();
1859         spin_lock(&cachep->spinlock);
1860
1861         /* Make slab active. */
1862         list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
1863         STATS_INC_GROWN(cachep);
1864         list3_data(cachep)->free_objects += cachep->num;
1865         spin_unlock(&cachep->spinlock);
1866         return 1;
1867 opps1:
1868         kmem_freepages(cachep, objp);
1869 failed:
1870         if (local_flags & __GFP_WAIT)
1871                 local_irq_disable();
1872         return 0;
1873 }
1874
1875 #if DEBUG
1876
1877 /*
1878  * Perform extra freeing checks:
1879  * - detect bad pointers.
1880  * - POISON/RED_ZONE checking
1881  * - destructor calls, for caches with POISON+dtor
1882  */
1883 static void kfree_debugcheck(const void *objp)
1884 {
1885         struct page *page;
1886
1887         if (!virt_addr_valid(objp)) {
1888                 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
1889                         (unsigned long)objp);
1890                 BUG();
1891         }
1892         page = virt_to_page(objp);
1893         if (!PageSlab(page)) {
1894                 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp);
1895                 BUG();
1896         }
1897 }
1898
1899 static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
1900                                         void *caller)
1901 {
1902         struct page *page;
1903         unsigned int objnr;
1904         struct slab *slabp;
1905
1906         objp -= obj_dbghead(cachep);
1907         kfree_debugcheck(objp);
1908         page = virt_to_page(objp);
1909
1910         if (GET_PAGE_CACHE(page) != cachep) {
1911                 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n",
1912                                 GET_PAGE_CACHE(page),cachep);
1913                 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
1914                 printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name);
1915                 WARN_ON(1);
1916         }
1917         slabp = GET_PAGE_SLAB(page);
1918
1919         if (cachep->flags & SLAB_RED_ZONE) {
1920                 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
1921                         slab_error(cachep, "double free, or memory outside"
1922                                                 " object was overwritten");
1923                         printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
1924                                         objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp));
1925                 }
1926                 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
1927                 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
1928         }
1929         if (cachep->flags & SLAB_STORE_USER)
1930                 *dbg_userword(cachep, objp) = caller;
1931
1932         objnr = (objp-slabp->s_mem)/cachep->objsize;
1933
1934         BUG_ON(objnr >= cachep->num);
1935         BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize);
1936
1937         if (cachep->flags & SLAB_DEBUG_INITIAL) {
1938                 /* Need to call the slab's constructor so the
1939                  * caller can perform a verify of its state (debugging).
1940                  * Called without the cache-lock held.
1941                  */
1942                 cachep->ctor(objp+obj_dbghead(cachep),
1943                                         cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
1944         }
1945         if (cachep->flags & SLAB_POISON && cachep->dtor) {
1946                 /* we want to cache poison the object,
1947                  * call the destruction callback
1948                  */
1949                 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0);
1950         }
1951         if (cachep->flags & SLAB_POISON) {
1952 #ifdef CONFIG_DEBUG_PAGEALLOC
1953                 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
1954                         store_stackinfo(cachep, objp, (unsigned long)caller);
1955                         kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
1956                 } else {
1957                         poison_obj(cachep, objp, POISON_FREE);
1958                 }
1959 #else
1960                 poison_obj(cachep, objp, POISON_FREE);
1961 #endif
1962         }
1963         return objp;
1964 }
1965
1966 static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
1967 {
1968         kmem_bufctl_t i;
1969         int entries = 0;
1970
1971         check_spinlock_acquired(cachep);
1972         /* Check slab's freelist to see if this obj is there. */
1973         for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
1974                 entries++;
1975                 if (entries > cachep->num || i >= cachep->num)
1976                         goto bad;
1977         }
1978         if (entries != cachep->num - slabp->inuse) {
1979 bad:
1980                 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
1981                                 cachep->name, cachep->num, slabp, slabp->inuse);
1982                 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) {
1983                         if ((i%16)==0)
1984                                 printk("\n%03x:", i);
1985                         printk(" %02x", ((unsigned char*)slabp)[i]);
1986                 }
1987                 printk("\n");
1988                 BUG();
1989         }
1990 }
1991 #else
/* Non-DEBUG builds: the free-path checks compile away. */
#define kfree_debugcheck(x)		do { } while (0)
#define cache_free_debugcheck(x,objp,z)	(objp)
#define check_slabp(x,y)		do { } while (0)
1995 #endif
1996
1997 static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags)
1998 {
1999         int batchcount;
2000         struct kmem_list3 *l3;
2001         struct array_cache *ac;
2002
2003         check_irq_off();
2004         ac = ac_data(cachep);
2005 retry:
2006         batchcount = ac->batchcount;
2007         if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2008                 /* if there was little recent activity on this
2009                  * cache, then perform only a partial refill.
2010                  * Otherwise we could generate refill bouncing.
2011                  */
2012                 batchcount = BATCHREFILL_LIMIT;
2013         }
2014         l3 = list3_data(cachep);
2015
2016         BUG_ON(ac->avail > 0);
2017         spin_lock(&cachep->spinlock);
2018         if (l3->shared) {
2019                 struct array_cache *shared_array = l3->shared;
2020                 if (shared_array->avail) {
2021                         if (batchcount > shared_array->avail)
2022                                 batchcount = shared_array->avail;
2023                         shared_array->avail -= batchcount;
2024                         ac->avail = batchcount;
2025                         memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
2026                                         sizeof(void*)*batchcount);
2027                         shared_array->touched = 1;
2028                         goto alloc_done;
2029                 }
2030         }
2031         while (batchcount > 0) {
2032                 struct list_head *entry;
2033                 struct slab *slabp;
2034                 /* Get slab alloc is to come from. */
2035                 entry = l3->slabs_partial.next;
2036                 if (entry == &l3->slabs_partial) {
2037                         l3->free_touched = 1;
2038                         entry = l3->slabs_free.next;
2039                         if (entry == &l3->slabs_free)
2040                                 goto must_grow;
2041                 }
2042
2043                 slabp = list_entry(entry, struct slab, list);
2044                 check_slabp(cachep, slabp);
2045                 check_spinlock_acquired(cachep);
2046                 while (slabp->inuse < cachep->num && batchcount--) {
2047                         kmem_bufctl_t next;
2048                         STATS_INC_ALLOCED(cachep);
2049                         STATS_INC_ACTIVE(cachep);
2050                         STATS_SET_HIGH(cachep);
2051
2052                         /* get obj pointer */
2053                         ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
2054
2055                         slabp->inuse++;
2056                         next = slab_bufctl(slabp)[slabp->free];
2057 #if DEBUG
2058                         slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2059 #endif
2060                         slabp->free = next;
2061                 }
2062                 check_slabp(cachep, slabp);
2063
2064                 /* move slabp to correct slabp list: */
2065                 list_del(&slabp->list);
2066                 if (slabp->free == BUFCTL_END)
2067                         list_add(&slabp->list, &l3->slabs_full);
2068                 else
2069                         list_add(&slabp->list, &l3->slabs_partial);
2070         }
2071
2072 must_grow:
2073         l3->free_objects -= ac->avail;
2074 alloc_done:
2075         spin_unlock(&cachep->spinlock);
2076
2077         if (unlikely(!ac->avail)) {
2078                 int x;
2079                 x = cache_grow(cachep, flags, -1);
2080
2081                 // cache_grow can reenable interrupts, then ac could change.
2082                 ac = ac_data(cachep);
2083                 if (!x && ac->avail == 0)       // no objects in sight? abort
2084                         return NULL;
2085
2086                 if (!ac->avail)         // objects refilled by interrupt?
2087                         goto retry;
2088         }
2089         ac->touched = 1;
2090         return ac_entry(ac)[--ac->avail];
2091 }
2092
2093 static inline void
2094 cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
2095 {
2096         might_sleep_if(flags & __GFP_WAIT);
2097 #if DEBUG
2098         kmem_flagcheck(cachep, flags);
2099 #endif
2100 }
2101
#if DEBUG
/*
 * Debug-build checks run on every object leaving the allocator.
 *
 * @flags:  allocation flags of the request
 * @objp:   raw object (including the debug head), may be NULL
 * @caller: address recorded in the user word (SLAB_STORE_USER)
 *
 * Verifies the poison pattern, records the caller, checks and arms
 * the red zones and, for poisoned caches, runs the constructor that
 * was skipped when the slab was set up.
 *
 * Returns the pointer to hand to the caller (advanced past the debug
 * head), or NULL if @objp was NULL.
 */
static void *
cache_alloc_debugcheck_after(kmem_cache_t *cachep,
			unsigned int __nocast flags, void *objp, void *caller)
{
	if (!objp)
		return objp;

	if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
		else
			check_poison_obj(cachep, objp);
#else
		check_poison_obj(cachep, objp);
#endif
		poison_obj(cachep, objp, POISON_INUSE);
	}

	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = caller;

	if (cachep->flags & SLAB_RED_ZONE) {
		/* A free object must have both zones marked inactive. */
		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
			slab_error(cachep, "double free, or memory outside"
						" object was overwritten");
			printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
					objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp));
		}
		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
	}

	objp += obj_dbghead(cachep);

	if (cachep->ctor && cachep->flags & SLAB_POISON) {
		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;

		/* Tell the constructor when it must not sleep. */
		if (!(flags & __GFP_WAIT))
			ctor_flags |= SLAB_CTOR_ATOMIC;

		cachep->ctor(objp, cachep, ctor_flags);
	}
	return objp;
}
#else
/* Non-DEBUG builds: the object is passed through untouched. */
#define cache_alloc_debugcheck_after(a,b,objp,d)	(objp)
#endif
2147
2148
2149 static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
2150 {
2151         unsigned long save_flags;
2152         void* objp;
2153         struct array_cache *ac;
2154
2155         cache_alloc_debugcheck_before(cachep, flags);
2156
2157         local_irq_save(save_flags);
2158         ac = ac_data(cachep);
2159         if (likely(ac->avail)) {
2160                 STATS_INC_ALLOCHIT(cachep);
2161                 ac->touched = 1;
2162                 objp = ac_entry(ac)[--ac->avail];
2163         } else {
2164                 STATS_INC_ALLOCMISS(cachep);
2165                 objp = cache_alloc_refill(cachep, flags);
2166         }
2167         local_irq_restore(save_flags);
2168         objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0));
2169         return objp;
2170 }
2171
2172 /*
2173  * NUMA: different approach needed if the spinlock is moved into
2174  * the l3 structure
2175  */
2176
2177 static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
2178 {
2179         int i;
2180
2181         check_spinlock_acquired(cachep);
2182
2183         /* NUMA: move add into loop */
2184         cachep->lists.free_objects += nr_objects;
2185
2186         for (i = 0; i < nr_objects; i++) {
2187                 void *objp = objpp[i];
2188                 struct slab *slabp;
2189                 unsigned int objnr;
2190
2191                 slabp = GET_PAGE_SLAB(virt_to_page(objp));
2192                 list_del(&slabp->list);
2193                 objnr = (objp - slabp->s_mem) / cachep->objsize;
2194                 check_slabp(cachep, slabp);
2195 #if DEBUG
2196                 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2197                         printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
2198                                                 cachep->name, objp);
2199                         BUG();
2200                 }
2201 #endif
2202                 slab_bufctl(slabp)[objnr] = slabp->free;
2203                 slabp->free = objnr;
2204                 STATS_DEC_ACTIVE(cachep);
2205                 slabp->inuse--;
2206                 check_slabp(cachep, slabp);
2207
2208                 /* fixup slab chains */
2209                 if (slabp->inuse == 0) {
2210                         if (cachep->lists.free_objects > cachep->free_limit) {
2211                                 cachep->lists.free_objects -= cachep->num;
2212                                 slab_destroy(cachep, slabp);
2213                         } else {
2214                                 list_add(&slabp->list,
2215                                 &list3_data_ptr(cachep, objp)->slabs_free);
2216                         }
2217                 } else {
2218                         /* Unconditionally move a slab to the end of the
2219                          * partial list on free - maximum time for the
2220                          * other objects to be freed, too.
2221                          */
2222                         list_add_tail(&slabp->list,
2223                                 &list3_data_ptr(cachep, objp)->slabs_partial);
2224                 }
2225         }
2226 }
2227
2228 static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2229 {
2230         int batchcount;
2231
2232         batchcount = ac->batchcount;
2233 #if DEBUG
2234         BUG_ON(!batchcount || batchcount > ac->avail);
2235 #endif
2236         check_irq_off();
2237         spin_lock(&cachep->spinlock);
2238         if (cachep->lists.shared) {
2239                 struct array_cache *shared_array = cachep->lists.shared;
2240                 int max = shared_array->limit-shared_array->avail;
2241                 if (max) {
2242                         if (batchcount > max)
2243                                 batchcount = max;
2244                         memcpy(&ac_entry(shared_array)[shared_array->avail],
2245                                         &ac_entry(ac)[0],
2246                                         sizeof(void*)*batchcount);
2247                         shared_array->avail += batchcount;
2248                         goto free_done;
2249                 }
2250         }
2251
2252         free_block(cachep, &ac_entry(ac)[0], batchcount);
2253 free_done:
2254 #if STATS
2255         {
2256                 int i = 0;
2257                 struct list_head *p;
2258
2259                 p = list3_data(cachep)->slabs_free.next;
2260                 while (p != &(list3_data(cachep)->slabs_free)) {
2261                         struct slab *slabp;
2262
2263                         slabp = list_entry(p, struct slab, list);
2264                         BUG_ON(slabp->inuse);
2265
2266                         i++;
2267                         p = p->next;
2268                 }
2269                 STATS_SET_FREEABLE(cachep, i);
2270         }
2271 #endif
2272         spin_unlock(&cachep->spinlock);
2273         ac->avail -= batchcount;
2274         memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
2275                         sizeof(void*)*ac->avail);
2276 }
2277
2278 /*
2279  * __cache_free
2280  * Release an obj back to its cache. If the obj has a constructed
2281  * state, it must be in this state _before_ it is released.
2282  *
2283  * Called with disabled ints.
2284  */
2285 static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2286 {
2287         struct array_cache *ac = ac_data(cachep);
2288
2289         check_irq_off();
2290         objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
2291
2292         if (likely(ac->avail < ac->limit)) {
2293                 STATS_INC_FREEHIT(cachep);
2294                 ac_entry(ac)[ac->avail++] = objp;
2295                 return;
2296         } else {
2297                 STATS_INC_FREEMISS(cachep);
2298                 cache_flusharray(cachep, ac);
2299                 ac_entry(ac)[ac->avail++] = objp;
2300         }
2301 }
2302
2303 /**
2304  * kmem_cache_alloc - Allocate an object
2305  * @cachep: The cache to allocate from.
2306  * @flags: See kmalloc().
2307  *
2308  * Allocate an object from this cache.  The flags are only relevant
2309  * if the cache has no available objects.
2310  */
2311 void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
2312 {
2313         return __cache_alloc(cachep, flags);
2314 }
2315 EXPORT_SYMBOL(kmem_cache_alloc);
2316
2317 /**
2318  * kmem_ptr_validate - check if an untrusted pointer might
2319  *      be a slab entry.
2320  * @cachep: the cache we're checking against
2321  * @ptr: pointer to validate
2322  *
2323  * This verifies that the untrusted pointer looks sane:
2324  * it is _not_ a guarantee that the pointer is actually
2325  * part of the slab cache in question, but it at least
2326  * validates that the pointer can be dereferenced and
2327  * looks half-way sane.
2328  *
2329  * Currently only used for dentry validation.
2330  */
2331 int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2332 {
2333         unsigned long addr = (unsigned long) ptr;
2334         unsigned long min_addr = PAGE_OFFSET;
2335         unsigned long align_mask = BYTES_PER_WORD-1;
2336         unsigned long size = cachep->objsize;
2337         struct page *page;
2338
2339         if (unlikely(addr < min_addr))
2340                 goto out;
2341         if (unlikely(addr > (unsigned long)high_memory - size))
2342                 goto out;
2343         if (unlikely(addr & align_mask))
2344                 goto out;
2345         if (unlikely(!kern_addr_valid(addr)))
2346                 goto out;
2347         if (unlikely(!kern_addr_valid(addr + size - 1)))
2348                 goto out;
2349         page = virt_to_page(ptr);
2350         if (unlikely(!PageSlab(page)))
2351                 goto out;
2352         if (unlikely(GET_PAGE_CACHE(page) != cachep))
2353                 goto out;
2354         return 1;
2355 out:
2356         return 0;
2357 }
2358
2359 #ifdef CONFIG_NUMA
2360 /**
2361  * kmem_cache_alloc_node - Allocate an object on the specified node
2362  * @cachep: The cache to allocate from.
2363  * @flags: See kmalloc().
2364  * @nodeid: node number of the target node.
2365  *
2366  * Identical to kmem_cache_alloc, except that this function is slow
2367  * and can sleep. And it will allocate memory on the given node, which
2368  * can improve the performance for cpu bound structures.
2369  */
2370 void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
2371 {
2372         int loop;
2373         void *objp;
2374         struct slab *slabp;
2375         kmem_bufctl_t next;
2376
2377         if (nodeid == -1)
2378                 return kmem_cache_alloc(cachep, flags);
2379
2380         for (loop = 0;;loop++) {
2381                 struct list_head *q;
2382
2383                 objp = NULL;
2384                 check_irq_on();
2385                 spin_lock_irq(&cachep->spinlock);
2386                 /* walk through all partial and empty slab and find one
2387                  * from the right node */
2388                 list_for_each(q,&cachep->lists.slabs_partial) {
2389                         slabp = list_entry(q, struct slab, list);
2390
2391                         if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
2392                                         loop > 2)
2393                                 goto got_slabp;
2394                 }
2395                 list_for_each(q, &cachep->lists.slabs_free) {
2396                         slabp = list_entry(q, struct slab, list);
2397
2398                         if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
2399                                         loop > 2)
2400                                 goto got_slabp;
2401                 }
2402                 spin_unlock_irq(&cachep->spinlock);
2403
2404                 local_irq_disable();
2405                 if (!cache_grow(cachep, flags, nodeid)) {
2406                         local_irq_enable();
2407                         return NULL;
2408                 }
2409                 local_irq_enable();
2410         }
2411 got_slabp:
2412         /* found one: allocate object */
2413         check_slabp(cachep, slabp);
2414         check_spinlock_acquired(cachep);
2415
2416         STATS_INC_ALLOCED(cachep);
2417         STATS_INC_ACTIVE(cachep);
2418         STATS_SET_HIGH(cachep);
2419         STATS_INC_NODEALLOCS(cachep);
2420
2421         objp = slabp->s_mem + slabp->free*cachep->objsize;
2422
2423         slabp->inuse++;
2424         next = slab_bufctl(slabp)[slabp->free];
2425 #if DEBUG
2426         slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2427 #endif
2428         slabp->free = next;
2429         check_slabp(cachep, slabp);
2430
2431         /* move slabp to correct slabp list: */
2432         list_del(&slabp->list);
2433         if (slabp->free == BUFCTL_END)
2434                 list_add(&slabp->list, &cachep->lists.slabs_full);
2435         else
2436                 list_add(&slabp->list, &cachep->lists.slabs_partial);
2437
2438         list3_data(cachep)->free_objects--;
2439         spin_unlock_irq(&cachep->spinlock);
2440
2441         objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
2442                                         __builtin_return_address(0));
2443         return objp;
2444 }
2445 EXPORT_SYMBOL(kmem_cache_alloc_node);
2446
2447 void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
2448 {
2449         kmem_cache_t *cachep;
2450
2451         cachep = kmem_find_general_cachep(size, flags);
2452         if (unlikely(cachep == NULL))
2453                 return NULL;
2454         return kmem_cache_alloc_node(cachep, flags, node);
2455 }
2456 EXPORT_SYMBOL(kmalloc_node);
2457 #endif
2458
2459 /**
2460  * kmalloc - allocate memory
2461  * @size: how many bytes of memory are required.
2462  * @flags: the type of memory to allocate.
2463  *
2464  * kmalloc is the normal method of allocating memory
2465  * in the kernel.
2466  *
2467  * The @flags argument may be one of:
2468  *
2469  * %GFP_USER - Allocate memory on behalf of user.  May sleep.
2470  *
2471  * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
2472  *
2473  * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
2474  *
2475  * Additionally, the %GFP_DMA flag may be set to indicate the memory
2476  * must be suitable for DMA.  This can mean different things on different
2477  * platforms.  For example, on i386, it means that the memory must come
2478  * from the first 16MB.
2479  */
2480 void *__kmalloc(size_t size, unsigned int __nocast flags)
2481 {
2482         kmem_cache_t *cachep;
2483
2484         /* If you want to save a few bytes .text space: replace
2485          * __ with kmem_.
2486          * Then kmalloc uses the uninlined functions instead of the inline
2487          * functions.
2488          */
2489         cachep = __find_general_cachep(size, flags);
2490         if (unlikely(cachep == NULL))
2491                 return NULL;
2492         return __cache_alloc(cachep, flags);
2493 }
2494 EXPORT_SYMBOL(__kmalloc);
2495
2496 #ifdef CONFIG_SMP
2497 /**
2498  * __alloc_percpu - allocate one copy of the object for every present
2499  * cpu in the system, zeroing them.
2500  * Objects should be dereferenced using the per_cpu_ptr macro only.
2501  *
2502  * @size: how many bytes of memory are required.
2503  * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
2504  */
2505 void *__alloc_percpu(size_t size, size_t align)
2506 {
2507         int i;
2508         struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
2509
2510         if (!pdata)
2511                 return NULL;
2512
2513         for (i = 0; i < NR_CPUS; i++) {
2514                 if (!cpu_possible(i))
2515                         continue;
2516                 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL,
2517                                                 cpu_to_node(i));
2518
2519                 if (!pdata->ptrs[i])
2520                         goto unwind_oom;
2521                 memset(pdata->ptrs[i], 0, size);
2522         }
2523
2524         /* Catch derefs w/o wrappers */
2525         return (void *) (~(unsigned long) pdata);
2526
2527 unwind_oom:
2528         while (--i >= 0) {
2529                 if (!cpu_possible(i))
2530                         continue;
2531                 kfree(pdata->ptrs[i]);
2532         }
2533         kfree(pdata);
2534         return NULL;
2535 }
2536 EXPORT_SYMBOL(__alloc_percpu);
2537 #endif
2538
2539 /**
2540  * kmem_cache_free - Deallocate an object
2541  * @cachep: The cache the allocation was from.
2542  * @objp: The previously allocated object.
2543  *
2544  * Free an object which was previously allocated from this
2545  * cache.
2546  */
2547 void kmem_cache_free(kmem_cache_t *cachep, void *objp)
2548 {
2549         unsigned long flags;
2550
2551         local_irq_save(flags);
2552         __cache_free(cachep, objp);
2553         local_irq_restore(flags);
2554 }
2555 EXPORT_SYMBOL(kmem_cache_free);
2556
2557 /**
2558  * kcalloc - allocate memory for an array. The memory is set to zero.
2559  * @n: number of elements.
2560  * @size: element size.
2561  * @flags: the type of memory to allocate.
2562  */
2563 void *kcalloc(size_t n, size_t size, unsigned int __nocast flags)
2564 {
2565         void *ret = NULL;
2566
2567         if (n != 0 && size > INT_MAX / n)
2568                 return ret;
2569
2570         ret = kmalloc(n * size, flags);
2571         if (ret)
2572                 memset(ret, 0, n * size);
2573         return ret;
2574 }
2575 EXPORT_SYMBOL(kcalloc);
2576
2577 /**
2578  * kfree - free previously allocated memory
2579  * @objp: pointer returned by kmalloc.
2580  *
2581  * Don't free memory not originally allocated by kmalloc()
2582  * or you will run into trouble.
2583  */
2584 void kfree(const void *objp)
2585 {
2586         kmem_cache_t *c;
2587         unsigned long flags;
2588
2589         if (unlikely(!objp))
2590                 return;
2591         local_irq_save(flags);
2592         kfree_debugcheck(objp);
2593         c = GET_PAGE_CACHE(virt_to_page(objp));
2594         __cache_free(c, (void*)objp);
2595         local_irq_restore(flags);
2596 }
2597 EXPORT_SYMBOL(kfree);
2598
2599 #ifdef CONFIG_SMP
2600 /**
2601  * free_percpu - free previously allocated percpu memory
2602  * @objp: pointer returned by alloc_percpu.
2603  *
2604  * Don't free memory not originally allocated by alloc_percpu()
2605  * The complemented objp is to check for that.
2606  */
2607 void
2608 free_percpu(const void *objp)
2609 {
2610         int i;
2611         struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
2612
2613         for (i = 0; i < NR_CPUS; i++) {
2614                 if (!cpu_possible(i))
2615                         continue;
2616                 kfree(p->ptrs[i]);
2617         }
2618         kfree(p);
2619 }
2620 EXPORT_SYMBOL(free_percpu);
2621 #endif
2622
2623 unsigned int kmem_cache_size(kmem_cache_t *cachep)
2624 {
2625         return obj_reallen(cachep);
2626 }
2627 EXPORT_SYMBOL(kmem_cache_size);
2628
2629 const char *kmem_cache_name(kmem_cache_t *cachep)
2630 {
2631         return cachep->name;
2632 }
2633 EXPORT_SYMBOL_GPL(kmem_cache_name);
2634
2635 struct ccupdate_struct {
2636         kmem_cache_t *cachep;
2637         struct array_cache *new[NR_CPUS];
2638 };
2639
2640 static void do_ccupdate_local(void *info)
2641 {
2642         struct ccupdate_struct *new = (struct ccupdate_struct *)info;
2643         struct array_cache *old;
2644
2645         check_irq_off();
2646         old = ac_data(new->cachep);
2647
2648         new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
2649         new->new[smp_processor_id()] = old;
2650 }
2651
2652
2653 static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
2654                                 int shared)
2655 {
2656         struct ccupdate_struct new;
2657         struct array_cache *new_shared;
2658         int i;
2659
2660         memset(&new.new,0,sizeof(new.new));
2661         for (i = 0; i < NR_CPUS; i++) {
2662                 if (cpu_online(i)) {
2663                         new.new[i] = alloc_arraycache(i, limit, batchcount);
2664                         if (!new.new[i]) {
2665                                 for (i--; i >= 0; i--) kfree(new.new[i]);
2666                                 return -ENOMEM;
2667                         }
2668                 } else {
2669                         new.new[i] = NULL;
2670                 }
2671         }
2672         new.cachep = cachep;
2673
2674         smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
2675
2676         check_irq_on();
2677         spin_lock_irq(&cachep->spinlock);
2678         cachep->batchcount = batchcount;
2679         cachep->limit = limit;
2680         cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
2681         spin_unlock_irq(&cachep->spinlock);
2682
2683         for (i = 0; i < NR_CPUS; i++) {
2684                 struct array_cache *ccold = new.new[i];
2685                 if (!ccold)
2686                         continue;
2687                 spin_lock_irq(&cachep->spinlock);
2688                 free_block(cachep, ac_entry(ccold), ccold->avail);
2689                 spin_unlock_irq(&cachep->spinlock);
2690                 kfree(ccold);
2691         }
2692         new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
2693         if (new_shared) {
2694                 struct array_cache *old;
2695
2696                 spin_lock_irq(&cachep->spinlock);
2697                 old = cachep->lists.shared;
2698                 cachep->lists.shared = new_shared;
2699                 if (old)
2700                         free_block(cachep, ac_entry(old), old->avail);
2701                 spin_unlock_irq(&cachep->spinlock);
2702                 kfree(old);
2703         }
2704
2705         return 0;
2706 }
2707
2708
2709 static void enable_cpucache(kmem_cache_t *cachep)
2710 {
2711         int err;
2712         int limit, shared;
2713
2714         /* The head array serves three purposes:
2715          * - create a LIFO ordering, i.e. return objects that are cache-warm
2716          * - reduce the number of spinlock operations.
2717          * - reduce the number of linked list operations on the slab and
2718          *   bufctl chains: array operations are cheaper.
2719          * The numbers are guessed, we should auto-tune as described by
2720          * Bonwick.
2721          */
2722         if (cachep->objsize > 131072)
2723                 limit = 1;
2724         else if (cachep->objsize > PAGE_SIZE)
2725                 limit = 8;
2726         else if (cachep->objsize > 1024)
2727                 limit = 24;
2728         else if (cachep->objsize > 256)
2729                 limit = 54;
2730         else
2731                 limit = 120;
2732
2733         /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
2734          * allocation behaviour: Most allocs on one cpu, most free operations
2735          * on another cpu. For these cases, an efficient object passing between
2736          * cpus is necessary. This is provided by a shared array. The array
2737          * replaces Bonwick's magazine layer.
2738          * On uniprocessor, it's functionally equivalent (but less efficient)
2739          * to a larger limit. Thus disabled by default.
2740          */
2741         shared = 0;
2742 #ifdef CONFIG_SMP
2743         if (cachep->objsize <= PAGE_SIZE)
2744                 shared = 8;
2745 #endif
2746
2747 #if DEBUG
2748         /* With debugging enabled, large batchcount lead to excessively
2749          * long periods with disabled local interrupts. Limit the
2750          * batchcount
2751          */
2752         if (limit > 32)
2753                 limit = 32;
2754 #endif
2755         err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
2756         if (err)
2757                 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
2758                                         cachep->name, -err);
2759 }
2760
2761 static void drain_array_locked(kmem_cache_t *cachep,
2762                                 struct array_cache *ac, int force)
2763 {
2764         int tofree;
2765
2766         check_spinlock_acquired(cachep);
2767         if (ac->touched && !force) {
2768                 ac->touched = 0;
2769         } else if (ac->avail) {
2770                 tofree = force ? ac->avail : (ac->limit+4)/5;
2771                 if (tofree > ac->avail) {
2772                         tofree = (ac->avail+1)/2;
2773                 }
2774                 free_block(cachep, ac_entry(ac), tofree);
2775                 ac->avail -= tofree;
2776                 memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
2777                                         sizeof(void*)*ac->avail);
2778         }
2779 }
2780
2781 /**
2782  * cache_reap - Reclaim memory from caches.
2783  *
2784  * Called from workqueue/eventd every few seconds.
2785  * Purpose:
2786  * - clear the per-cpu caches for this CPU.
2787  * - return freeable pages to the main free memory pool.
2788  *
2789  * If we cannot acquire the cache chain semaphore then just give up - we'll
2790  * try again on the next iteration.
2791  */
2792 static void cache_reap(void *unused)
2793 {
2794         struct list_head *walk;
2795
2796         if (down_trylock(&cache_chain_sem)) {
2797                 /* Give up. Setup the next iteration. */
2798                 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
2799                 return;
2800         }
2801
2802         list_for_each(walk, &cache_chain) {
2803                 kmem_cache_t *searchp;
2804                 struct list_head* p;
2805                 int tofree;
2806                 struct slab *slabp;
2807
2808                 searchp = list_entry(walk, kmem_cache_t, next);
2809
2810                 if (searchp->flags & SLAB_NO_REAP)
2811                         goto next;
2812
2813                 check_irq_on();
2814
2815                 spin_lock_irq(&searchp->spinlock);
2816
2817                 drain_array_locked(searchp, ac_data(searchp), 0);
2818
2819                 if(time_after(searchp->lists.next_reap, jiffies))
2820                         goto next_unlock;
2821
2822                 searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
2823
2824                 if (searchp->lists.shared)
2825                         drain_array_locked(searchp, searchp->lists.shared, 0);
2826
2827                 if (searchp->lists.free_touched) {
2828                         searchp->lists.free_touched = 0;
2829                         goto next_unlock;
2830                 }
2831
2832                 tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
2833                 do {
2834                         p = list3_data(searchp)->slabs_free.next;
2835                         if (p == &(list3_data(searchp)->slabs_free))
2836                                 break;
2837
2838                         slabp = list_entry(p, struct slab, list);
2839                         BUG_ON(slabp->inuse);
2840                         list_del(&slabp->list);
2841                         STATS_INC_REAPED(searchp);
2842
2843                         /* Safe to drop the lock. The slab is no longer
2844                          * linked to the cache.
2845                          * searchp cannot disappear, we hold
2846                          * cache_chain_lock
2847                          */
2848                         searchp->lists.free_objects -= searchp->num;
2849                         spin_unlock_irq(&searchp->spinlock);
2850                         slab_destroy(searchp, slabp);
2851                         spin_lock_irq(&searchp->spinlock);
2852                 } while(--tofree > 0);
2853 next_unlock:
2854                 spin_unlock_irq(&searchp->spinlock);
2855 next:
2856                 cond_resched();
2857         }
2858         check_irq_on();
2859         up(&cache_chain_sem);
2860         drain_remote_pages();
2861         /* Setup the next iteration */
2862         schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
2863 }
2864
2865 #ifdef CONFIG_PROC_FS
2866
2867 static void *s_start(struct seq_file *m, loff_t *pos)
2868 {
2869         loff_t n = *pos;
2870         struct list_head *p;
2871
2872         down(&cache_chain_sem);
2873         if (!n) {
2874                 /*
2875                  * Output format version, so at least we can change it
2876                  * without _too_ many complaints.
2877                  */
2878 #if STATS
2879                 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
2880 #else
2881                 seq_puts(m, "slabinfo - version: 2.1\n");
2882 #endif
2883                 seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
2884                 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
2885                 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
2886 #if STATS
2887                 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
2888                                 " <error> <maxfreeable> <freelimit> <nodeallocs>");
2889                 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
2890 #endif
2891                 seq_putc(m, '\n');
2892         }
2893         p = cache_chain.next;
2894         while (n--) {
2895                 p = p->next;
2896                 if (p == &cache_chain)
2897                         return NULL;
2898         }
2899         return list_entry(p, kmem_cache_t, next);
2900 }
2901
2902 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2903 {
2904         kmem_cache_t *cachep = p;
2905         ++*pos;
2906         return cachep->next.next == &cache_chain ? NULL
2907                 : list_entry(cachep->next.next, kmem_cache_t, next);
2908 }
2909
2910 static void s_stop(struct seq_file *m, void *p)
2911 {
2912         up(&cache_chain_sem);
2913 }
2914
2915 static int s_show(struct seq_file *m, void *p)
2916 {
2917         kmem_cache_t *cachep = p;
2918         struct list_head *q;
2919         struct slab     *slabp;
2920         unsigned long   active_objs;
2921         unsigned long   num_objs;
2922         unsigned long   active_slabs = 0;
2923         unsigned long   num_slabs;
2924         const char *name;
2925         char *error = NULL;
2926
2927         check_irq_on();
2928         spin_lock_irq(&cachep->spinlock);
2929         active_objs = 0;
2930         num_slabs = 0;
2931         list_for_each(q,&cachep->lists.slabs_full) {
2932                 slabp = list_entry(q, struct slab, list);
2933                 if (slabp->inuse != cachep->num && !error)
2934                         error = "slabs_full accounting error";
2935                 active_objs += cachep->num;
2936                 active_slabs++;
2937         }
2938         list_for_each(q,&cachep->lists.slabs_partial) {
2939                 slabp = list_entry(q, struct slab, list);
2940                 if (slabp->inuse == cachep->num && !error)
2941                         error = "slabs_partial inuse accounting error";
2942                 if (!slabp->inuse && !error)
2943                         error = "slabs_partial/inuse accounting error";
2944                 active_objs += slabp->inuse;
2945                 active_slabs++;
2946         }
2947         list_for_each(q,&cachep->lists.slabs_free) {
2948                 slabp = list_entry(q, struct slab, list);
2949                 if (slabp->inuse && !error)
2950                         error = "slabs_free/inuse accounting error";
2951                 num_slabs++;
2952         }
2953         num_slabs+=active_slabs;
2954         num_objs = num_slabs*cachep->num;
2955         if (num_objs - active_objs != cachep->lists.free_objects && !error)
2956                 error = "free_objects accounting error";
2957
2958         name = cachep->name;
2959         if (error)
2960                 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
2961
2962         seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
2963                 name, active_objs, num_objs, cachep->objsize,
2964                 cachep->num, (1<<cachep->gfporder));
2965         seq_printf(m, " : tunables %4u %4u %4u",
2966                         cachep->limit, cachep->batchcount,
2967                         cachep->lists.shared->limit/cachep->batchcount);
2968         seq_printf(m, " : slabdata %6lu %6lu %6u",
2969                         active_slabs, num_slabs, cachep->lists.shared->avail);
2970 #if STATS
2971         {       /* list3 stats */
2972                 unsigned long high = cachep->high_mark;
2973                 unsigned long allocs = cachep->num_allocations;
2974                 unsigned long grown = cachep->grown;
2975                 unsigned long reaped = cachep->reaped;
2976                 unsigned long errors = cachep->errors;
2977                 unsigned long max_freeable = cachep->max_freeable;
2978                 unsigned long free_limit = cachep->free_limit;
2979                 unsigned long node_allocs = cachep->node_allocs;
2980
2981                 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
2982                                 allocs, high, grown, reaped, errors,
2983                                 max_freeable, free_limit, node_allocs);
2984         }
2985         /* cpu stats */
2986         {
2987                 unsigned long allochit = atomic_read(&cachep->allochit);
2988                 unsigned long allocmiss = atomic_read(&cachep->allocmiss);
2989                 unsigned long freehit = atomic_read(&cachep->freehit);
2990                 unsigned long freemiss = atomic_read(&cachep->freemiss);
2991
2992                 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
2993                         allochit, allocmiss, freehit, freemiss);
2994         }
2995 #endif
2996         seq_putc(m, '\n');
2997         spin_unlock_irq(&cachep->spinlock);
2998         return 0;
2999 }
3000
3001 /*
3002  * slabinfo_op - iterator that generates /proc/slabinfo
3003  *
3004  * Output layout:
3005  * cache-name
3006  * num-active-objs
3007  * total-objs
3008  * object size
3009  * num-active-slabs
3010  * total-slabs
3011  * num-pages-per-slab
3012  * + further values on SMP and with statistics enabled
3013  */
3014
3015 struct seq_operations slabinfo_op = {
3016         .start  = s_start,
3017         .next   = s_next,
3018         .stop   = s_stop,
3019         .show   = s_show,
3020 };
3021
3022 #define MAX_SLABINFO_WRITE 128
3023 /**
3024  * slabinfo_write - Tuning for the slab allocator
3025  * @file: unused
3026  * @buffer: user buffer
3027  * @count: data length
3028  * @ppos: unused
3029  */
3030 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
3031                                 size_t count, loff_t *ppos)
3032 {
3033         char kbuf[MAX_SLABINFO_WRITE+1], *tmp;
3034         int limit, batchcount, shared, res;
3035         struct list_head *p;
3036
3037         if (count > MAX_SLABINFO_WRITE)
3038                 return -EINVAL;
3039         if (copy_from_user(&kbuf, buffer, count))
3040                 return -EFAULT;
3041         kbuf[MAX_SLABINFO_WRITE] = '\0';
3042
3043         tmp = strchr(kbuf, ' ');
3044         if (!tmp)
3045                 return -EINVAL;
3046         *tmp = '\0';
3047         tmp++;
3048         if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
3049                 return -EINVAL;
3050
3051         /* Find the cache in the chain of caches. */
3052         down(&cache_chain_sem);
3053         res = -EINVAL;
3054         list_for_each(p,&cache_chain) {
3055                 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
3056
3057                 if (!strcmp(cachep->name, kbuf)) {
3058                         if (limit < 1 ||
3059                             batchcount < 1 ||
3060                             batchcount > limit ||
3061                             shared < 0) {
3062                                 res = -EINVAL;
3063                         } else {
3064                                 res = do_tune_cpucache(cachep, limit, batchcount, shared);
3065                         }
3066                         break;
3067                 }
3068         }
3069         up(&cache_chain_sem);
3070         if (res >= 0)
3071                 res = count;
3072         return res;
3073 }
3074 #endif
3075
/*
 * ksize - report the object size of the cache an allocation came from
 * @objp: pointer to a slab object, or NULL
 *
 * Looks up the owning cache through the object's page and returns that
 * cache's kmem_cache_size().  Returns 0 for a NULL pointer.
 */
unsigned int ksize(const void *objp)
{
	unsigned long irq_state;
	unsigned int bytes;

	if (unlikely(objp == NULL))
		return 0;

	local_irq_save(irq_state);
	bytes = kmem_cache_size(GET_PAGE_CACHE(virt_to_page(objp)));
	local_irq_restore(irq_state);

	return bytes;
}
3091
3092
3093 /*
3094  * kstrdup - allocate space for and copy an existing string
3095  *
3096  * @s: the string to duplicate
3097  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3098  */
3099 char *kstrdup(const char *s, unsigned int __nocast gfp)
3100 {
3101         size_t len;
3102         char *buf;
3103
3104         if (!s)
3105                 return NULL;
3106
3107         len = strlen(s) + 1;
3108         buf = kmalloc(len, gfp);
3109         if (buf)
3110                 memcpy(buf, s, len);
3111         return buf;
3112 }
3113 EXPORT_SYMBOL(kstrdup);