2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists.
5 * The allocator synchronizes using per slab locks or atomic operations
6 * and only uses a centralized lock to manage a pool of partial slabs.
8 * (C) 2007 SGI, Christoph Lameter
9 * (C) 2011 Linux Foundation, Christoph Lameter
13 #include <linux/swap.h> /* struct reclaim_state */
14 #include <linux/module.h>
15 #include <linux/bit_spinlock.h>
16 #include <linux/interrupt.h>
17 #include <linux/bitops.h>
18 #include <linux/slab.h>
19 #include <linux/proc_fs.h>
20 #include <linux/seq_file.h>
21 #include <linux/kmemcheck.h>
22 #include <linux/cpu.h>
23 #include <linux/cpuset.h>
24 #include <linux/mempolicy.h>
25 #include <linux/ctype.h>
26 #include <linux/debugobjects.h>
27 #include <linux/kallsyms.h>
28 #include <linux/memory.h>
29 #include <linux/math64.h>
30 #include <linux/fault-inject.h>
31 #include <linux/stacktrace.h>
33 #include <trace/events/kmem.h>
37 * 1. slub_lock (Global Semaphore)
39 * 3. slab_lock(page) (Only on some arches and for debugging)
43 * The role of the slub_lock is to protect the list of all the slabs
44 * and to synchronize major metadata changes to slab cache structures.
46 * The slab_lock is only used for debugging and on arches that do not
47 * have the ability to do a cmpxchg_double. It only protects the second
48 * double word in the page struct. Meaning
49 * A. page->freelist -> List of free objects in a page
50 * B. page->counters -> Counters of objects
51 * C. page->frozen -> frozen state
53 * If a slab is frozen then it is exempt from list management. It is not
54 * on any list. The processor that froze the slab is the one who can
55 * perform list operations on the page. Other processors may put objects
56 * onto the freelist but the processor that froze the slab is the only
57 * one that can retrieve the objects from the page's freelist.
59 * The list_lock protects the partial and full list on each node and
60 * the partial slab counter. If taken then no new slabs may be added or
61 * removed from the lists nor may the number of partial slabs be modified.
62 * (Note that the total number of slabs is an atomic value that may be
63 * modified without taking the list lock).
65 * The list_lock is a centralized lock and thus we avoid taking it as
66 * much as possible. As long as SLUB does not have to handle partial
67 * slabs, operations can continue without any centralized lock. F.e.
68 * allocating a long series of objects that fill up slabs does not require the list lock.
70 * Interrupts are disabled during allocation and deallocation in order to
71 * make the slab allocator safe to use in the context of an irq. In addition
72 * interrupts are disabled to ensure that the processor does not change
73 * while handling per_cpu slabs, due to kernel preemption.
75 * SLUB assigns one slab for allocation to each processor.
76 * Allocations only occur from these slabs called cpu slabs.
78 * Slabs with free elements are kept on a partial list and during regular
79 * operations no list for full slabs is used. If an object in a full slab is
80 * freed then the slab will show up again on the partial lists.
81 * We track full slabs for debugging purposes though because otherwise we
82 * cannot scan all objects.
84 * Slabs are freed when they become empty. Teardown and setup is
85 * minimal so we rely on the page allocator's per cpu caches for
86 * fast frees and allocs.
88 * Overloading of page flags that are otherwise used for LRU management.
90 * PageActive The slab is frozen and exempt from list processing.
91 * This means that the slab is dedicated to a purpose
92 * such as satisfying allocations for a specific
93 * processor. Objects may be freed in the slab while
94 * it is frozen but slab_free will then skip the usual
95 * list operations. It is up to the processor holding
96 * the slab to integrate the slab into the slab lists
97 * when the slab is no longer needed.
99 * One use of this flag is to mark slabs that are
100 * used for allocations. Then such a slab becomes a cpu
101 * slab. The cpu slab may be equipped with an additional
102 * freelist that allows lockless access to
103 * free objects in addition to the regular freelist
104 * that requires the slab lock.
106 * PageError Slab requires special handling due to debug
107 * options set. This moves slab handling out of
108 * the fast path and disables lockless freelists.
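/*
 * Illustrative sketch, not part of slub.c: the lockless freelist idea
 * described above, reduced to a single-word compare-and-swap.  The real
 * code pairs page->freelist with page->counters in a cmpxchg_double so
 * that the object count and the frozen bit change atomically together
 * with the list head; this sketch only shows the shape of the retry
 * loop.  All names are hypothetical.
 */
static inline void *sketch_freelist_pop(void **freelist_head, size_t offset)
{
	void *old, *next;

	do {
		old = __atomic_load_n(freelist_head, __ATOMIC_ACQUIRE);
		if (!old)
			return NULL;
		/* the next pointer lives inside the free object itself */
		next = *(void **)((char *)old + offset);
	} while (!__atomic_compare_exchange_n(freelist_head, &old, next,
					      0, __ATOMIC_ACQ_REL,
					      __ATOMIC_ACQUIRE));
	return old;
}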
111 #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
112 SLAB_TRACE | SLAB_DEBUG_FREE)
114 static inline int kmem_cache_debug(struct kmem_cache *s)
116 #ifdef CONFIG_SLUB_DEBUG
117 return unlikely(s->flags & SLAB_DEBUG_FLAGS);
124 * Issues still to be resolved:
126 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
128 * - Variable sizing of the per node arrays
131 /* Enable to test recovery from slab corruption on boot */
132 #undef SLUB_RESILIENCY_TEST
134 /* Enable to log cmpxchg failures */
135 #undef SLUB_DEBUG_CMPXCHG
138 * Minimum number of partial slabs. These will be left on the partial
139 * lists even if they are empty. kmem_cache_shrink may reclaim them.
141 #define MIN_PARTIAL 5
144 * Maximum number of desirable partial slabs.
145 * The existence of more partial slabs makes kmem_cache_shrink
146 * sort the partial list by the number of objects in use.
148 #define MAX_PARTIAL 10
150 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
151 SLAB_POISON | SLAB_STORE_USER)
154 * Debugging flags that require metadata to be stored in the slab. These get
155 * disabled when slub_debug=O is used and a cache's min order increases with metadata.
158 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
161 * Set of flags that will prevent slab merging
163 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
164 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
167 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
168 SLAB_CACHE_DMA | SLAB_NOTRACK)
171 #define OO_MASK ((1 << OO_SHIFT) - 1)
172 #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
174 /* Internal SLUB flags */
175 #define __OBJECT_POISON 0x80000000UL /* Poison object */
176 #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
178 static int kmem_size = sizeof(struct kmem_cache);
181 static struct notifier_block slab_notifier;
185 DOWN, /* No slab functionality available */
186 PARTIAL, /* Kmem_cache_node works */
187 UP, /* Everything works but does not show up in sysfs */
191 /* A list of all slab caches on the system */
192 static DECLARE_RWSEM(slub_lock);
193 static LIST_HEAD(slab_caches);
196 * Tracking user of a slab.
198 #define TRACK_ADDRS_COUNT 16
200 unsigned long addr; /* Called from address */
201 #ifdef CONFIG_STACKTRACE
202 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Caller stack trace */
204 int cpu; /* Was running on cpu */
205 int pid; /* Pid context */
206 unsigned long when; /* When did the operation occur */
209 enum track_item { TRACK_ALLOC, TRACK_FREE };
212 static int sysfs_slab_add(struct kmem_cache *);
213 static int sysfs_slab_alias(struct kmem_cache *, const char *);
214 static void sysfs_slab_remove(struct kmem_cache *);
217 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
218 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
220 static inline void sysfs_slab_remove(struct kmem_cache *s)
228 static inline void stat(const struct kmem_cache *s, enum stat_item si)
230 #ifdef CONFIG_SLUB_STATS
231 __this_cpu_inc(s->cpu_slab->stat[si]);
235 /********************************************************************
236 * Core slab cache functions
237 *******************************************************************/
239 int slab_is_available(void)
241 return slab_state >= UP;
244 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
246 return s->node[node];
249 /* Verify that a pointer has an address that is valid within a slab page */
250 static inline int check_valid_pointer(struct kmem_cache *s,
251 struct page *page, const void *object)
258 base = page_address(page);
259 if (object < base || object >= base + page->objects * s->size ||
260 (object - base) % s->size) {
267 static inline void *get_freepointer(struct kmem_cache *s, void *object)
269 return *(void **)(object + s->offset);
272 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
276 #ifdef CONFIG_DEBUG_PAGEALLOC
277 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
279 p = get_freepointer(s, object);
284 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
286 *(void **)(object + s->offset) = fp;
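/*
 * Illustrative sketch, not part of slub.c: how a freelist is threaded
 * through the free objects themselves, which is what get_freepointer()
 * and set_freepointer() above rely on.  Each free object stores the
 * address of the next free object at a fixed offset inside itself, so
 * no external metadata is needed.  Names are hypothetical; new_slab()
 * below builds the real list this way via set_freepointer().
 */
static inline void *sketch_thread_freelist(char *slab_base, size_t obj_size,
					   unsigned int nr_objects,
					   size_t free_ptr_offset)
{
	unsigned int i;

	for (i = 0; i + 1 < nr_objects; i++)
		*(void **)(slab_base + i * obj_size + free_ptr_offset) =
					slab_base + (i + 1) * obj_size;
	/* the last object terminates the chain */
	*(void **)(slab_base + (nr_objects - 1) * obj_size + free_ptr_offset) = NULL;

	return slab_base;			/* head of the freelist */
}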
289 /* Loop over all objects in a slab */
290 #define for_each_object(__p, __s, __addr, __objects) \
291 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
294 /* Determine object index from a given position */
295 static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
297 return (p - addr) / s->size;
300 static inline size_t slab_ksize(const struct kmem_cache *s)
302 #ifdef CONFIG_SLUB_DEBUG
304 * Debugging requires use of the padding between object
305 * and whatever may come after it.
307 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
312 * If we have the need to store the freelist pointer
313 * back there or track user information then we can
314 * only use the space before that information.
316 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
319 * Else we can use all the padding etc for the allocation
324 static inline int order_objects(int order, unsigned long size, int reserved)
326 return ((PAGE_SIZE << order) - reserved) / size;
329 static inline struct kmem_cache_order_objects oo_make(int order,
330 unsigned long size, int reserved)
332 struct kmem_cache_order_objects x = {
333 (order << OO_SHIFT) + order_objects(order, size, reserved)
339 static inline int oo_order(struct kmem_cache_order_objects x)
341 return x.x >> OO_SHIFT;
344 static inline int oo_objects(struct kmem_cache_order_objects x)
346 return x.x & OO_MASK;
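/*
 * Illustrative sketch, not part of slub.c: how oo_make()/oo_order()/
 * oo_objects() above pack the page order and the object count into a
 * single word.  The 16-bit split is an assumption for the example; the
 * OO_SHIFT definition itself is not visible in this excerpt.
 */
#define SKETCH_OO_SHIFT	16
#define SKETCH_OO_MASK	((1 << SKETCH_OO_SHIFT) - 1)

static inline unsigned long sketch_oo_pack(int order, unsigned long size,
					   int reserved)
{
	/* with 4KiB pages: order 1, size 256, reserved 0 -> 32 objects */
	unsigned long objects = ((PAGE_SIZE << order) - reserved) / size;

	return ((unsigned long)order << SKETCH_OO_SHIFT) + objects;
}

static inline int sketch_oo_unpack_order(unsigned long x)
{
	return x >> SKETCH_OO_SHIFT;	/* mirrors oo_order() */
}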
350 * Per slab locking using the pagelock
352 static __always_inline void slab_lock(struct page *page)
354 bit_spin_lock(PG_locked, &page->flags);
357 static __always_inline void slab_unlock(struct page *page)
359 __bit_spin_unlock(PG_locked, &page->flags);
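/*
 * Illustrative sketch, not part of slub.c: the semantics of the
 * bit_spin_lock() pair used above, expressed with GCC atomic builtins.
 * The real helpers additionally disable preemption and operate on the
 * PG_locked bit of page->flags; this only shows the test-and-set spin
 * on one bit of a word.
 */
static inline void sketch_bit_lock(unsigned long *word, unsigned int bit)
{
	unsigned long mask = 1UL << bit;

	/* spin until we are the caller that flipped the bit from 0 to 1 */
	while (__atomic_fetch_or(word, mask, __ATOMIC_ACQUIRE) & mask)
		;
}

static inline void sketch_bit_unlock(unsigned long *word, unsigned int bit)
{
	__atomic_fetch_and(word, ~(1UL << bit), __ATOMIC_RELEASE);
}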
362 /* Interrupts must be disabled (for the fallback code to work right) */
363 static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
364 void *freelist_old, unsigned long counters_old,
365 void *freelist_new, unsigned long counters_new,
368 VM_BUG_ON(!irqs_disabled());
369 #ifdef CONFIG_CMPXCHG_DOUBLE
370 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist,
372 freelist_old, counters_old,
373 freelist_new, counters_new))
379 if (page->freelist == freelist_old && page->counters == counters_old) {
380 page->freelist = freelist_new;
381 page->counters = counters_new;
389 stat(s, CMPXCHG_DOUBLE_FAIL);
391 #ifdef SLUB_DEBUG_CMPXCHG
392 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
398 static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
399 void *freelist_old, unsigned long counters_old,
400 void *freelist_new, unsigned long counters_new,
403 #ifdef CONFIG_CMPXCHG_DOUBLE
404 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist,
406 freelist_old, counters_old,
407 freelist_new, counters_new))
414 local_irq_save(flags);
416 if (page->freelist == freelist_old && page->counters == counters_old) {
417 page->freelist = freelist_new;
418 page->counters = counters_new;
420 local_irq_restore(flags);
424 local_irq_restore(flags);
428 stat(s, CMPXCHG_DOUBLE_FAIL);
430 #ifdef SLUB_DEBUG_CMPXCHG
431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
437 #ifdef CONFIG_SLUB_DEBUG
439 * Determine a map of objects in use on a page.
441 * Node listlock must be held to guarantee that the page does
442 * not vanish from under us.
444 static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
447 void *addr = page_address(page);
449 for (p = page->freelist; p; p = get_freepointer(s, p))
450 set_bit(slab_index(p, s, addr), map);
456 #ifdef CONFIG_SLUB_DEBUG_ON
457 static int slub_debug = DEBUG_DEFAULT_FLAGS;
459 static int slub_debug;
462 static char *slub_debug_slabs;
463 static int disable_higher_order_debug;
468 static void print_section(char *text, u8 *addr, unsigned int length)
470 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
474 static struct track *get_track(struct kmem_cache *s, void *object,
475 enum track_item alloc)
480 p = object + s->offset + sizeof(void *);
482 p = object + s->inuse;
487 static void set_track(struct kmem_cache *s, void *object,
488 enum track_item alloc, unsigned long addr)
490 struct track *p = get_track(s, object, alloc);
493 #ifdef CONFIG_STACKTRACE
494 struct stack_trace trace;
497 trace.nr_entries = 0;
498 trace.max_entries = TRACK_ADDRS_COUNT;
499 trace.entries = p->addrs;
501 save_stack_trace(&trace);
503 /* See rant in lockdep.c */
504 if (trace.nr_entries != 0 &&
505 trace.entries[trace.nr_entries - 1] == ULONG_MAX)
508 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
512 p->cpu = smp_processor_id();
513 p->pid = current->pid;
516 memset(p, 0, sizeof(struct track));
519 static void init_tracking(struct kmem_cache *s, void *object)
521 if (!(s->flags & SLAB_STORE_USER))
524 set_track(s, object, TRACK_FREE, 0UL);
525 set_track(s, object, TRACK_ALLOC, 0UL);
528 static void print_track(const char *s, struct track *t)
533 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
534 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
535 #ifdef CONFIG_STACKTRACE
538 for (i = 0; i < TRACK_ADDRS_COUNT; i++)
540 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
547 static void print_tracking(struct kmem_cache *s, void *object)
549 if (!(s->flags & SLAB_STORE_USER))
552 print_track("Allocated", get_track(s, object, TRACK_ALLOC));
553 print_track("Freed", get_track(s, object, TRACK_FREE));
556 static void print_page_info(struct page *page)
558 printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
559 page, page->objects, page->inuse, page->freelist, page->flags);
563 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
569 vsnprintf(buf, sizeof(buf), fmt, args);
571 printk(KERN_ERR "========================================"
572 "=====================================\n");
573 printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
574 printk(KERN_ERR "----------------------------------------"
575 "-------------------------------------\n\n");
578 static void slab_fix(struct kmem_cache *s, char *fmt, ...)
584 vsnprintf(buf, sizeof(buf), fmt, args);
586 printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
589 static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
591 unsigned int off; /* Offset of last byte */
592 u8 *addr = page_address(page);
594 print_tracking(s, p);
596 print_page_info(page);
598 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
599 p, p - addr, get_freepointer(s, p));
602 print_section("Bytes b4 ", p - 16, 16);
604 print_section("Object ", p, min_t(unsigned long, s->objsize,
606 if (s->flags & SLAB_RED_ZONE)
607 print_section("Redzone ", p + s->objsize,
608 s->inuse - s->objsize);
611 off = s->offset + sizeof(void *);
615 if (s->flags & SLAB_STORE_USER)
616 off += 2 * sizeof(struct track);
619 /* Beginning of the filler is the free pointer */
620 print_section("Padding ", p + off, s->size - off);
625 static void object_err(struct kmem_cache *s, struct page *page,
626 u8 *object, char *reason)
628 slab_bug(s, "%s", reason);
629 print_trailer(s, page, object);
632 static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
638 vsnprintf(buf, sizeof(buf), fmt, args);
640 slab_bug(s, "%s", buf);
641 print_page_info(page);
645 static void init_object(struct kmem_cache *s, void *object, u8 val)
649 if (s->flags & __OBJECT_POISON) {
650 memset(p, POISON_FREE, s->objsize - 1);
651 p[s->objsize - 1] = POISON_END;
654 if (s->flags & SLAB_RED_ZONE)
655 memset(p + s->objsize, val, s->inuse - s->objsize);
658 static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
669 static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
672 unsigned int words, prefix;
675 return check_bytes8(start, value, bytes);
677 value64 = value | value << 8 | value << 16 | value << 24;
678 value64 = (value64 & 0xffffffff) | value64 << 32;
679 prefix = 8 - ((unsigned long)start) % 8;
682 u8 *r = check_bytes8(start, value, prefix);
692 if (*(u64 *)start != value64)
693 return check_bytes8(start, value, 8);
698 return check_bytes8(start, value, bytes % 8);
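/*
 * Illustrative sketch, not part of slub.c: the pattern verification that
 * check_bytes()/check_bytes8() above perform, written as one standalone
 * helper.  The byte is replicated into a 64-bit word so that eight bytes
 * can be compared per iteration, with a byte-wise check for the tail.
 */
static inline int sketch_verify_pattern(const unsigned char *buf, size_t len,
					unsigned char value)
{
	unsigned long long pattern = value * 0x0101010101010101ULL;
	size_t i;

	for (i = 0; i + sizeof(pattern) <= len; i += sizeof(pattern)) {
		unsigned long long word;

		__builtin_memcpy(&word, buf + i, sizeof(word));
		if (word != pattern)
			return 0;	/* first mismatch lies in this word */
	}
	for (; i < len; i++)
		if (buf[i] != value)
			return 0;
	return 1;
}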
701 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
702 void *from, void *to)
704 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
705 memset(from, data, to - from);
708 static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
709 u8 *object, char *what,
710 u8 *start, unsigned int value, unsigned int bytes)
715 fault = check_bytes(start, value, bytes);
720 while (end > fault && end[-1] == value)
723 slab_bug(s, "%s overwritten", what);
724 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
725 fault, end - 1, fault[0], value);
726 print_trailer(s, page, object);
728 restore_bytes(s, what, value, fault, end);
736 * Bytes of the object to be managed.
737 * If the freepointer may overlay the object then the free
738 * pointer is the first word of the object.
740 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 0xa5 (POISON_END).
743 * object + s->objsize
744 * Padding to reach word boundary. This is also used for Redzoning.
745 * Padding is extended by another word if Redzoning is enabled and
748 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
749 * 0xcc (RED_ACTIVE) for objects in use.
752 * Meta data starts here.
754 * A. Free pointer (if we cannot overwrite object on free)
755 * B. Tracking data for SLAB_STORE_USER
756 * C. Padding to reach required alignment boundary or at minimum
757 * one word if debugging is on to be able to detect writes
758 * before the word boundary.
760 * Padding is done using 0x5a (POISON_INUSE)
763 * Nothing is used beyond s->size.
765 * If slabcaches are merged then the objsize and inuse boundaries are mostly
766 * ignored. And therefore no slab options that rely on these boundaries
767 * may be used with merged slabcaches.
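/*
 * Illustrative sketch, not part of slub.c: where the metadata described
 * above ends up, following the same rules as get_track() above and
 * check_pad_bytes() below.  s->inuse marks the end of the object data
 * plus any red zone word; the free pointer only needs extra room when
 * it cannot overlay the object (s->offset != 0).
 */
static inline unsigned long sketch_metadata_end(unsigned long inuse,
						unsigned long offset,
						int store_user)
{
	unsigned long off = inuse;		/* object + red zone */

	if (offset)
		off += sizeof(void *);		/* out-of-line free pointer */
	if (store_user)
		off += 2 * sizeof(struct track);/* alloc + free tracking */

	return off;	/* POISON_INUSE padding runs from here to s->size */
}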
770 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
772 unsigned long off = s->inuse; /* The end of info */
775 /* Freepointer is placed after the object. */
776 off += sizeof(void *);
778 if (s->flags & SLAB_STORE_USER)
779 /* We also have user information there */
780 off += 2 * sizeof(struct track);
785 return check_bytes_and_report(s, page, p, "Object padding",
786 p + off, POISON_INUSE, s->size - off);
789 /* Check the pad bytes at the end of a slab page */
790 static int slab_pad_check(struct kmem_cache *s, struct page *page)
798 if (!(s->flags & SLAB_POISON))
801 start = page_address(page);
802 length = (PAGE_SIZE << compound_order(page)) - s->reserved;
803 end = start + length;
804 remainder = length % s->size;
808 fault = check_bytes(end - remainder, POISON_INUSE, remainder);
811 while (end > fault && end[-1] == POISON_INUSE)
814 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
815 print_section("Padding ", end - remainder, remainder);
817 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
821 static int check_object(struct kmem_cache *s, struct page *page,
822 void *object, u8 val)
825 u8 *endobject = object + s->objsize;
827 if (s->flags & SLAB_RED_ZONE) {
828 if (!check_bytes_and_report(s, page, object, "Redzone",
829 endobject, val, s->inuse - s->objsize))
832 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
833 check_bytes_and_report(s, page, p, "Alignment padding",
834 endobject, POISON_INUSE, s->inuse - s->objsize);
838 if (s->flags & SLAB_POISON) {
839 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
840 (!check_bytes_and_report(s, page, p, "Poison", p,
841 POISON_FREE, s->objsize - 1) ||
842 !check_bytes_and_report(s, page, p, "Poison",
843 p + s->objsize - 1, POISON_END, 1)))
846 * check_pad_bytes cleans up on its own.
848 check_pad_bytes(s, page, p);
851 if (!s->offset && val == SLUB_RED_ACTIVE)
853 * Object and freepointer overlap. Cannot check
854 * freepointer while object is allocated.
858 /* Check free pointer validity */
859 if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
860 object_err(s, page, p, "Freepointer corrupt");
862 * No choice but to zap it and thus lose the remainder
863 * of the free objects in this slab. May cause
864 * another error because the object count is now wrong.
866 set_freepointer(s, p, NULL);
872 static int check_slab(struct kmem_cache *s, struct page *page)
876 VM_BUG_ON(!irqs_disabled());
878 if (!PageSlab(page)) {
879 slab_err(s, page, "Not a valid slab page");
883 maxobj = order_objects(compound_order(page), s->size, s->reserved);
884 if (page->objects > maxobj) {
885 slab_err(s, page, "objects %u > max %u",
886 page->objects, maxobj);
889 if (page->inuse > page->objects) {
890 slab_err(s, page, "inuse %u > max %u",
891 page->inuse, page->objects);
894 /* Slab_pad_check fixes things up after itself */
895 slab_pad_check(s, page);
900 * Determine if a certain object on a page is on the freelist. Must hold the
901 * slab lock to guarantee that the chains are in a consistent state.
903 static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
908 unsigned long max_objects;
911 while (fp && nr <= page->objects) {
914 if (!check_valid_pointer(s, page, fp)) {
916 object_err(s, page, object,
917 "Freechain corrupt");
918 set_freepointer(s, object, NULL);
921 slab_err(s, page, "Freepointer corrupt");
922 page->freelist = NULL;
923 page->inuse = page->objects;
924 slab_fix(s, "Freelist cleared");
930 fp = get_freepointer(s, object);
934 max_objects = order_objects(compound_order(page), s->size, s->reserved);
935 if (max_objects > MAX_OBJS_PER_PAGE)
936 max_objects = MAX_OBJS_PER_PAGE;
938 if (page->objects != max_objects) {
939 slab_err(s, page, "Wrong number of objects. Found %d but "
940 "should be %d", page->objects, max_objects);
941 page->objects = max_objects;
942 slab_fix(s, "Number of objects adjusted.");
944 if (page->inuse != page->objects - nr) {
945 slab_err(s, page, "Wrong object count. Counter is %d but "
946 "counted were %d", page->inuse, page->objects - nr);
947 page->inuse = page->objects - nr;
948 slab_fix(s, "Object count adjusted.");
950 return search == NULL;
953 static void trace(struct kmem_cache *s, struct page *page, void *object,
956 if (s->flags & SLAB_TRACE) {
957 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
959 alloc ? "alloc" : "free",
964 print_section("Object ", (void *)object, s->objsize);
971 * Hooks for other subsystems that check memory allocations. In a typical
972 * production configuration these hooks all should produce no code at all.
974 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
976 flags &= gfp_allowed_mask;
977 lockdep_trace_alloc(flags);
978 might_sleep_if(flags & __GFP_WAIT);
980 return should_failslab(s->objsize, flags, s->flags);
983 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
985 flags &= gfp_allowed_mask;
986 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
987 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
990 static inline void slab_free_hook(struct kmem_cache *s, void *x)
992 kmemleak_free_recursive(x, s->flags);
995 * Trouble is that we may no longer disable interrupts in the fast path
996 * So in order to make the debug calls that expect irqs to be
997 * disabled we need to disable interrupts temporarily.
999 #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
1001 unsigned long flags;
1003 local_irq_save(flags);
1004 kmemcheck_slab_free(s, x, s->objsize);
1005 debug_check_no_locks_freed(x, s->objsize);
1006 local_irq_restore(flags);
1009 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1010 debug_check_no_obj_freed(x, s->objsize);
1014 * Tracking of fully allocated slabs for debugging purposes.
1016 * list_lock must be held.
1018 static void add_full(struct kmem_cache *s,
1019 struct kmem_cache_node *n, struct page *page)
1021 if (!(s->flags & SLAB_STORE_USER))
1024 list_add(&page->lru, &n->full);
1028 * list_lock must be held.
1030 static void remove_full(struct kmem_cache *s, struct page *page)
1032 if (!(s->flags & SLAB_STORE_USER))
1035 list_del(&page->lru);
1038 /* Tracking of the number of slabs for debugging purposes */
1039 static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1041 struct kmem_cache_node *n = get_node(s, node);
1043 return atomic_long_read(&n->nr_slabs);
1046 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1048 return atomic_long_read(&n->nr_slabs);
1051 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1053 struct kmem_cache_node *n = get_node(s, node);
1056 * May be called early in order to allocate a slab for the
1057 * kmem_cache_node structure. Solve the chicken-egg
1058 * dilemma by deferring the increment of the count during
1059 * bootstrap (see early_kmem_cache_node_alloc).
1062 atomic_long_inc(&n->nr_slabs);
1063 atomic_long_add(objects, &n->total_objects);
1066 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1068 struct kmem_cache_node *n = get_node(s, node);
1070 atomic_long_dec(&n->nr_slabs);
1071 atomic_long_sub(objects, &n->total_objects);
1074 /* Object debug checks for alloc/free paths */
1075 static void setup_object_debug(struct kmem_cache *s, struct page *page,
1078 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
1081 init_object(s, object, SLUB_RED_INACTIVE);
1082 init_tracking(s, object);
1085 static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
1086 void *object, unsigned long addr)
1088 if (!check_slab(s, page))
1091 if (!check_valid_pointer(s, page, object)) {
1092 object_err(s, page, object, "Freelist Pointer check fails");
1096 if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1099 /* Success perform special debug activities for allocs */
1100 if (s->flags & SLAB_STORE_USER)
1101 set_track(s, object, TRACK_ALLOC, addr);
1102 trace(s, page, object, 1);
1103 init_object(s, object, SLUB_RED_ACTIVE);
1107 if (PageSlab(page)) {
1109 * If this is a slab page then lets do the best we can
1110 * to avoid issues in the future. Marking all objects
1111 * as used avoids touching the remaining objects.
1113 slab_fix(s, "Marking all objects used");
1114 page->inuse = page->objects;
1115 page->freelist = NULL;
1120 static noinline int free_debug_processing(struct kmem_cache *s,
1121 struct page *page, void *object, unsigned long addr)
1123 unsigned long flags;
1126 local_irq_save(flags);
1129 if (!check_slab(s, page))
1132 if (!check_valid_pointer(s, page, object)) {
1133 slab_err(s, page, "Invalid object pointer 0x%p", object);
1137 if (on_freelist(s, page, object)) {
1138 object_err(s, page, object, "Object already free");
1142 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1145 if (unlikely(s != page->slab)) {
1146 if (!PageSlab(page)) {
1147 slab_err(s, page, "Attempt to free object(0x%p) "
1148 "outside of slab", object);
1149 } else if (!page->slab) {
1151 "SLUB <none>: no slab for object 0x%p.\n",
1155 object_err(s, page, object,
1156 "page slab pointer corrupt.");
1160 if (s->flags & SLAB_STORE_USER)
1161 set_track(s, object, TRACK_FREE, addr);
1162 trace(s, page, object, 0);
1163 init_object(s, object, SLUB_RED_INACTIVE);
1167 local_irq_restore(flags);
1171 slab_fix(s, "Object at 0x%p not freed", object);
1175 static int __init setup_slub_debug(char *str)
1177 slub_debug = DEBUG_DEFAULT_FLAGS;
1178 if (*str++ != '=' || !*str)
1180 * No options specified. Switch on full debugging.
1186 * No options but restriction on slabs. This means full
1187 * debugging for slabs matching a pattern.
1191 if (tolower(*str) == 'o') {
1193 * Avoid enabling debugging on caches if its minimum order
1194 * would increase as a result.
1196 disable_higher_order_debug = 1;
1203 * Switch off all debugging measures.
1208 * Determine which debug features should be switched on
1210 for (; *str && *str != ','; str++) {
1211 switch (tolower(*str)) {
1213 slub_debug |= SLAB_DEBUG_FREE;
1216 slub_debug |= SLAB_RED_ZONE;
1219 slub_debug |= SLAB_POISON;
1222 slub_debug |= SLAB_STORE_USER;
1225 slub_debug |= SLAB_TRACE;
1228 slub_debug |= SLAB_FAILSLAB;
1231 printk(KERN_ERR "slub_debug option '%c' "
1232 "unknown. skipped\n", *str);
1238 slub_debug_slabs = str + 1;
1243 __setup("slub_debug", setup_slub_debug);
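/*
 * Usage sketch, summarising the parser above rather than adding code.
 * On the kernel command line (the case labels for the single letter
 * flags are elided in this excerpt; the conventional letters are shown):
 *
 *	slub_debug		enable DEBUG_DEFAULT_FLAGS on all caches
 *	slub_debug=,dentry	full debugging, only for caches matching "dentry"
 *	slub_debug=FZP		sanity checks (SLAB_DEBUG_FREE), red zoning
 *				and poisoning
 *	slub_debug=O		do not let debugging raise a cache's min order
 *	slub_debug=-		switch all debugging off
 *
 * Everything after a comma is taken as the cache name pattern stored in
 * slub_debug_slabs.
 */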
1245 static unsigned long kmem_cache_flags(unsigned long objsize,
1246 unsigned long flags, const char *name,
1247 void (*ctor)(void *))
1250 * Enable debugging if selected on the kernel commandline.
1252 if (slub_debug && (!slub_debug_slabs ||
1253 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1254 flags |= slub_debug;
1259 static inline void setup_object_debug(struct kmem_cache *s,
1260 struct page *page, void *object) {}
1262 static inline int alloc_debug_processing(struct kmem_cache *s,
1263 struct page *page, void *object, unsigned long addr) { return 0; }
1265 static inline int free_debug_processing(struct kmem_cache *s,
1266 struct page *page, void *object, unsigned long addr) { return 0; }
1268 static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1270 static inline int check_object(struct kmem_cache *s, struct page *page,
1271 void *object, u8 val) { return 1; }
1272 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1273 struct page *page) {}
1274 static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1275 static inline unsigned long kmem_cache_flags(unsigned long objsize,
1276 unsigned long flags, const char *name,
1277 void (*ctor)(void *))
1281 #define slub_debug 0
1283 #define disable_higher_order_debug 0
1285 static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1287 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1289 static inline void inc_slabs_node(struct kmem_cache *s, int node,
1291 static inline void dec_slabs_node(struct kmem_cache *s, int node,
1294 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1297 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1300 static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1302 #endif /* CONFIG_SLUB_DEBUG */
1305 * Slab allocation and freeing
1307 static inline struct page *alloc_slab_page(gfp_t flags, int node,
1308 struct kmem_cache_order_objects oo)
1310 int order = oo_order(oo);
1312 flags |= __GFP_NOTRACK;
1314 if (node == NUMA_NO_NODE)
1315 return alloc_pages(flags, order);
1317 return alloc_pages_exact_node(node, flags, order);
1320 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1323 struct kmem_cache_order_objects oo = s->oo;
1326 flags &= gfp_allowed_mask;
1328 if (flags & __GFP_WAIT)
1331 flags |= s->allocflags;
1334 * Let the initial higher-order allocation fail under memory pressure
1335 * so we fall-back to the minimum order allocation.
1337 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1339 page = alloc_slab_page(alloc_gfp, node, oo);
1340 if (unlikely(!page)) {
1343 * Allocation may have failed due to fragmentation.
1344 * Try a lower order alloc if possible
1346 page = alloc_slab_page(flags, node, oo);
1349 stat(s, ORDER_FALLBACK);
1352 if (flags & __GFP_WAIT)
1353 local_irq_disable();
1358 if (kmemcheck_enabled
1359 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1360 int pages = 1 << oo_order(oo);
1362 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1365 * Objects from caches that have a constructor don't get
1366 * cleared when they're allocated, so we need to do it here.
1369 kmemcheck_mark_uninitialized_pages(page, pages);
1371 kmemcheck_mark_unallocated_pages(page, pages);
1374 page->objects = oo_objects(oo);
1375 mod_zone_page_state(page_zone(page),
1376 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1377 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1383 static void setup_object(struct kmem_cache *s, struct page *page,
1386 setup_object_debug(s, page, object);
1387 if (unlikely(s->ctor))
1391 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1398 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1400 page = allocate_slab(s,
1401 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1405 inc_slabs_node(s, page_to_nid(page), page->objects);
1407 page->flags |= 1 << PG_slab;
1409 start = page_address(page);
1411 if (unlikely(s->flags & SLAB_POISON))
1412 memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
1415 for_each_object(p, s, start, page->objects) {
1416 setup_object(s, page, last);
1417 set_freepointer(s, last, p);
1420 setup_object(s, page, last);
1421 set_freepointer(s, last, NULL);
1423 page->freelist = start;
1430 static void __free_slab(struct kmem_cache *s, struct page *page)
1432 int order = compound_order(page);
1433 int pages = 1 << order;
1435 if (kmem_cache_debug(s)) {
1438 slab_pad_check(s, page);
1439 for_each_object(p, s, page_address(page),
1441 check_object(s, page, p, SLUB_RED_INACTIVE);
1444 kmemcheck_free_shadow(page, compound_order(page));
1446 mod_zone_page_state(page_zone(page),
1447 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1448 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1451 __ClearPageSlab(page);
1452 reset_page_mapcount(page);
1453 if (current->reclaim_state)
1454 current->reclaim_state->reclaimed_slab += pages;
1455 __free_pages(page, order);
1458 #define need_reserve_slab_rcu \
1459 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
1461 static void rcu_free_slab(struct rcu_head *h)
1465 if (need_reserve_slab_rcu)
1466 page = virt_to_head_page(h);
1468 page = container_of((struct list_head *)h, struct page, lru);
1470 __free_slab(page->slab, page);
1473 static void free_slab(struct kmem_cache *s, struct page *page)
1475 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1476 struct rcu_head *head;
1478 if (need_reserve_slab_rcu) {
1479 int order = compound_order(page);
1480 int offset = (PAGE_SIZE << order) - s->reserved;
1482 VM_BUG_ON(s->reserved != sizeof(*head));
1483 head = page_address(page) + offset;
1486 * RCU free overloads the RCU head over the LRU
1488 head = (void *)&page->lru;
1491 call_rcu(head, rcu_free_slab);
1493 __free_slab(s, page);
1496 static void discard_slab(struct kmem_cache *s, struct page *page)
1498 dec_slabs_node(s, page_to_nid(page), page->objects);
1503 * Management of partially allocated slabs.
1505 * list_lock must be held.
1507 static inline void add_partial(struct kmem_cache_node *n,
1508 struct page *page, int tail)
1511 if (tail == DEACTIVATE_TO_TAIL)
1512 list_add_tail(&page->lru, &n->partial);
1514 list_add(&page->lru, &n->partial);
1518 * list_lock must be held.
1520 static inline void remove_partial(struct kmem_cache_node *n,
1523 list_del(&page->lru);
1528 * Lock slab, remove from the partial list and put the object into the
1531 * Must hold list_lock.
1533 static inline int acquire_slab(struct kmem_cache *s,
1534 struct kmem_cache_node *n, struct page *page)
1537 unsigned long counters;
1541 * Zap the freelist and set the frozen bit.
1542 * The old freelist is the list of objects for the
1543 * per cpu allocation list.
1546 freelist = page->freelist;
1547 counters = page->counters;
1548 new.counters = counters;
1549 new.inuse = page->objects;
1551 VM_BUG_ON(new.frozen);
1554 } while (!__cmpxchg_double_slab(s, page,
1557 "lock and freeze"));
1559 remove_partial(n, page);
1562 /* Populate the per cpu freelist */
1563 this_cpu_write(s->cpu_slab->freelist, freelist);
1564 this_cpu_write(s->cpu_slab->page, page);
1565 this_cpu_write(s->cpu_slab->node, page_to_nid(page));
1569 * Slab page came from the wrong list. No object to allocate
1570 * from. Put it onto the correct list and continue partial
1573 printk(KERN_ERR "SLUB: %s : Page without available objects on"
1574 " partial list\n", s->name);
1580 * Try to allocate a partial slab from a specific node.
1582 static struct page *get_partial_node(struct kmem_cache *s,
1583 struct kmem_cache_node *n)
1588 * Racy check. If we mistakenly see no partial slabs then we
1589 * just allocate an empty slab. If we mistakenly try to get a
1590 * partial slab and there is none available then get_partial_node()
1593 if (!n || !n->nr_partial)
1596 spin_lock(&n->list_lock);
1597 list_for_each_entry(page, &n->partial, lru)
1598 if (acquire_slab(s, n, page))
1602 spin_unlock(&n->list_lock);
1607 * Get a page from somewhere. Search in increasing NUMA distances.
1609 static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1612 struct zonelist *zonelist;
1615 enum zone_type high_zoneidx = gfp_zone(flags);
1619 * The defrag ratio allows a configuration of the tradeoffs between
1620 * inter node defragmentation and node local allocations. A lower
1621 * defrag_ratio increases the tendency to do local allocations
1622 * instead of attempting to obtain partial slabs from other nodes.
1624 * If the defrag_ratio is set to 0 then kmalloc() always
1625 * returns node local objects. If the ratio is higher then kmalloc()
1626 * may return off node objects because partial slabs are obtained
1627 * from other nodes and filled up.
1629 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1630 * defrag_ratio = 1000) then every (well almost) allocation will
1631 * first attempt to defrag slab caches on other nodes. This means
1632 * scanning over all nodes to look for partial slabs which may be
1633 * expensive if we do it every time we are trying to find a slab
1634 * with available objects.
1636 if (!s->remote_node_defrag_ratio ||
1637 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1641 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1642 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1643 struct kmem_cache_node *n;
1645 n = get_node(s, zone_to_nid(zone));
1647 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1648 n->nr_partial > s->min_partial) {
1649 page = get_partial_node(s, n);
1662 * Get a partial page, lock it and return it.
1664 static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1667 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1669 page = get_partial_node(s, get_node(s, searchnode));
1670 if (page || node != NUMA_NO_NODE)
1673 return get_any_partial(s, flags);
1676 #ifdef CONFIG_PREEMPT
1678 * Calculate the next globally unique transaction for disambiguation
1679 * during cmpxchg. The transactions start with the cpu number and are then
1680 * incremented by CONFIG_NR_CPUS.
1682 #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
1685 * No preemption supported therefore also no need to check for different cpus.
1691 static inline unsigned long next_tid(unsigned long tid)
1693 return tid + TID_STEP;
1696 static inline unsigned int tid_to_cpu(unsigned long tid)
1698 return tid % TID_STEP;
1701 static inline unsigned long tid_to_event(unsigned long tid)
1703 return tid / TID_STEP;
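/*
 * Worked sketch, not part of slub.c: with CONFIG_NR_CPUS rounded up to
 * a TID_STEP of 256 (an assumed value for illustration), cpu 3 starts
 * with tid 3; after two operations its tid is 3 + 2 * 256 = 515, so
 * tid_to_cpu(515) == 3 and tid_to_event(515) == 2.  A stale tid in the
 * cmpxchg therefore reveals both migration to another cpu and
 * intervening operations on the same cpu.
 */
static inline void sketch_decode_tid(unsigned long tid,
				     unsigned int *cpu, unsigned long *event)
{
	*cpu = tid % TID_STEP;		/* same arithmetic as tid_to_cpu() */
	*event = tid / TID_STEP;	/* same arithmetic as tid_to_event() */
}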
1706 static inline unsigned int init_tid(int cpu)
1711 static inline void note_cmpxchg_failure(const char *n,
1712 const struct kmem_cache *s, unsigned long tid)
1714 #ifdef SLUB_DEBUG_CMPXCHG
1715 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1717 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
1719 #ifdef CONFIG_PREEMPT
1720 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1721 printk("due to cpu change %d -> %d\n",
1722 tid_to_cpu(tid), tid_to_cpu(actual_tid));
1725 if (tid_to_event(tid) != tid_to_event(actual_tid))
1726 printk("due to cpu running other code. Event %ld->%ld\n",
1727 tid_to_event(tid), tid_to_event(actual_tid));
1729 printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
1730 actual_tid, tid, next_tid(tid));
1732 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1735 void init_kmem_cache_cpus(struct kmem_cache *s)
1739 for_each_possible_cpu(cpu)
1740 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1747 * Remove the cpu slab
1749 static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1751 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1752 struct page *page = c->page;
1753 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1755 enum slab_modes l = M_NONE, m = M_NONE;
1758 int tail = DEACTIVATE_TO_HEAD;
1762 if (page->freelist) {
1763 stat(s, DEACTIVATE_REMOTE_FREES);
1764 tail = DEACTIVATE_TO_TAIL;
1767 c->tid = next_tid(c->tid);
1769 freelist = c->freelist;
1773 * Stage one: Free all available per cpu objects back
1774 * to the page freelist while it is still frozen. Leave the
1777 * There is no need to take the list->lock because the page
1780 while (freelist && (nextfree = get_freepointer(s, freelist))) {
1782 unsigned long counters;
1785 prior = page->freelist;
1786 counters = page->counters;
1787 set_freepointer(s, freelist, prior);
1788 new.counters = counters;
1790 VM_BUG_ON(!new.frozen);
1792 } while (!__cmpxchg_double_slab(s, page,
1794 freelist, new.counters,
1795 "drain percpu freelist"));
1797 freelist = nextfree;
1801 * Stage two: Ensure that the page is unfrozen while the
1802 * list presence reflects the actual number of objects
1805 * We setup the list membership and then perform a cmpxchg
1806 * with the count. If there is a mismatch then the page
1807 * is not unfrozen but the page is on the wrong list.
1809 * Then we restart the process which may have to remove
1810 * the page from the list that we just put it on again
1811 * because the number of objects in the slab may have
1816 old.freelist = page->freelist;
1817 old.counters = page->counters;
1818 VM_BUG_ON(!old.frozen);
1820 /* Determine target state of the slab */
1821 new.counters = old.counters;
1824 set_freepointer(s, freelist, old.freelist);
1825 new.freelist = freelist;
1827 new.freelist = old.freelist;
1831 if (!new.inuse && n->nr_partial > s->min_partial)
1833 else if (new.freelist) {
1838 * Taking the spinlock removes the possibility
1839 * that acquire_slab() will see a slab page that
1842 spin_lock(&n->list_lock);
1846 if (kmem_cache_debug(s) && !lock) {
1849 * This also ensures that the scanning of full
1850 * slabs from diagnostic functions will not see
1853 spin_lock(&n->list_lock);
1861 remove_partial(n, page);
1863 else if (l == M_FULL)
1865 remove_full(s, page);
1867 if (m == M_PARTIAL) {
1869 add_partial(n, page, tail);
1872 } else if (m == M_FULL) {
1874 stat(s, DEACTIVATE_FULL);
1875 add_full(s, n, page);
1881 if (!__cmpxchg_double_slab(s, page,
1882 old.freelist, old.counters,
1883 new.freelist, new.counters,
1888 spin_unlock(&n->list_lock);
1891 stat(s, DEACTIVATE_EMPTY);
1892 discard_slab(s, page);
1897 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1899 stat(s, CPUSLAB_FLUSH);
1900 deactivate_slab(s, c);
1906 * Called from IPI handler with interrupts disabled.
1908 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1910 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1912 if (likely(c && c->page))
1916 static void flush_cpu_slab(void *d)
1918 struct kmem_cache *s = d;
1920 __flush_cpu_slab(s, smp_processor_id());
1923 static void flush_all(struct kmem_cache *s)
1925 on_each_cpu(flush_cpu_slab, s, 1);
1929 * Check if the objects in a per cpu structure fit numa
1930 * locality expectations.
1932 static inline int node_match(struct kmem_cache_cpu *c, int node)
1935 if (node != NUMA_NO_NODE && c->node != node)
1941 static int count_free(struct page *page)
1943 return page->objects - page->inuse;
1946 static unsigned long count_partial(struct kmem_cache_node *n,
1947 int (*get_count)(struct page *))
1949 unsigned long flags;
1950 unsigned long x = 0;
1953 spin_lock_irqsave(&n->list_lock, flags);
1954 list_for_each_entry(page, &n->partial, lru)
1955 x += get_count(page);
1956 spin_unlock_irqrestore(&n->list_lock, flags);
1960 static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
1962 #ifdef CONFIG_SLUB_DEBUG
1963 return atomic_long_read(&n->total_objects);
1969 static noinline void
1970 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1975 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1977 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
1978 "default order: %d, min order: %d\n", s->name, s->objsize,
1979 s->size, oo_order(s->oo), oo_order(s->min));
1981 if (oo_order(s->min) > get_order(s->objsize))
1982 printk(KERN_WARNING " %s debugging increased min order, use "
1983 "slub_debug=O to disable.\n", s->name);
1985 for_each_online_node(node) {
1986 struct kmem_cache_node *n = get_node(s, node);
1987 unsigned long nr_slabs;
1988 unsigned long nr_objs;
1989 unsigned long nr_free;
1994 nr_free = count_partial(n, count_free);
1995 nr_slabs = node_nr_slabs(n);
1996 nr_objs = node_nr_objs(n);
1999 " node %d: slabs: %ld, objs: %ld, free: %ld\n",
2000 node, nr_slabs, nr_objs, nr_free);
2005 * Slow path. The lockless freelist is empty or we need to perform debugging duties.
2008 * Interrupts are disabled.
2010 * Processing is still very fast if new objects have been freed to the
2011 * regular freelist. In that case we simply take over the regular freelist
2012 * as the lockless freelist and zap the regular freelist.
2014 * If that is not working then we fall back to the partial lists. We take the
2015 * first element of the freelist as the object to allocate now and move the
2016 * rest of the freelist to the lockless freelist.
2018 * And if we were unable to get a new slab from the partial slab lists then
2019 * we need to allocate a new slab. This is the slowest path since it involves
2020 * a call to the page allocator and the setup of a new slab.
2022 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2023 unsigned long addr, struct kmem_cache_cpu *c)
2027 unsigned long flags;
2029 unsigned long counters;
2031 local_irq_save(flags);
2032 #ifdef CONFIG_PREEMPT
2034 * We may have been preempted and rescheduled on a different
2035 * cpu before disabling interrupts. Need to reload cpu area
2038 c = this_cpu_ptr(s->cpu_slab);
2041 /* We handle __GFP_ZERO in the caller */
2042 gfpflags &= ~__GFP_ZERO;
2048 if (unlikely(!node_match(c, node))) {
2049 stat(s, ALLOC_NODE_MISMATCH);
2050 deactivate_slab(s, c);
2054 stat(s, ALLOC_SLOWPATH);
2057 object = page->freelist;
2058 counters = page->counters;
2059 new.counters = counters;
2060 VM_BUG_ON(!new.frozen);
2063 * If there is no object left then we use this loop to
2064 * deactivate the slab which is simple since no objects
2065 * are left in the slab and therefore we do not need to
2066 * put the page back onto the partial list.
2068 * If there are objects left then we retrieve them
2069 * and use them to refill the per cpu queue.
2072 new.inuse = page->objects;
2073 new.frozen = object != NULL;
2075 } while (!__cmpxchg_double_slab(s, page,
2080 if (unlikely(!object)) {
2082 stat(s, DEACTIVATE_BYPASS);
2086 stat(s, ALLOC_REFILL);
2089 VM_BUG_ON(!page->frozen);
2090 c->freelist = get_freepointer(s, object);
2091 c->tid = next_tid(c->tid);
2092 local_irq_restore(flags);
2096 page = get_partial(s, gfpflags, node);
2098 stat(s, ALLOC_FROM_PARTIAL);
2099 object = c->freelist;
2101 if (kmem_cache_debug(s))
2106 page = new_slab(s, gfpflags, node);
2109 c = __this_cpu_ptr(s->cpu_slab);
2114 * No other reference to the page yet so we can
2115 * muck around with it freely without cmpxchg
2117 object = page->freelist;
2118 page->freelist = NULL;
2119 page->inuse = page->objects;
2121 stat(s, ALLOC_SLAB);
2122 c->node = page_to_nid(page);
2125 if (kmem_cache_debug(s))
2129 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2130 slab_out_of_memory(s, gfpflags, node);
2131 local_irq_restore(flags);
2135 if (!object || !alloc_debug_processing(s, page, object, addr))
2138 c->freelist = get_freepointer(s, object);
2139 deactivate_slab(s, c);
2141 c->node = NUMA_NO_NODE;
2142 local_irq_restore(flags);
2147 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
2148 * have the fastpath folded into their functions. So no function call
2149 * overhead for requests that can be satisfied on the fastpath.
2151 * The fastpath works by first checking if the lockless freelist can be used.
2152 * If not then __slab_alloc is called for slow processing.
2154 * Otherwise we can simply pick the next object from the lockless free list.
2156 static __always_inline void *slab_alloc(struct kmem_cache *s,
2157 gfp_t gfpflags, int node, unsigned long addr)
2160 struct kmem_cache_cpu *c;
2163 if (slab_pre_alloc_hook(s, gfpflags))
2169 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2170 * enabled. We may switch back and forth between cpus while
2171 * reading from one cpu area. That does not matter as long
2172 * as we end up on the original cpu again when doing the cmpxchg.
2174 c = __this_cpu_ptr(s->cpu_slab);
2177 * The transaction ids are globally unique per cpu and per operation on
2178 * a per cpu queue. Thus they guarantee that the cmpxchg_double
2179 * occurs on the right processor and that there was no operation on the
2180 * linked list in between.
2185 object = c->freelist;
2186 if (unlikely(!object || !node_match(c, node)))
2188 object = __slab_alloc(s, gfpflags, node, addr, c);
2192 * The cmpxchg will only match if there was no additional
2193 * operation and if we are on the right processor.
2195 * The cmpxchg does the following atomically (without lock semantics!)
2196 * 1. Relocate first pointer to the current per cpu area.
2197 * 2. Verify that tid and freelist have not been changed
2198 * 3. If they were not changed replace tid and freelist
2200 * Since this is without lock semantics the protection is only against
2201 * code executing on this cpu *not* from access by other cpus.
2203 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2204 s->cpu_slab->freelist, s->cpu_slab->tid,
2206 get_freepointer_safe(s, object), next_tid(tid)))) {
2208 note_cmpxchg_failure("slab_alloc", s, tid);
2211 stat(s, ALLOC_FASTPATH);
2214 if (unlikely(gfpflags & __GFP_ZERO) && object)
2215 memset(object, 0, s->objsize);
2217 slab_post_alloc_hook(s, gfpflags, object);
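/*
 * Illustrative sketch, not part of slub.c: the double-word check that
 * the fastpath cmpxchg above performs.  The real code updates freelist
 * and tid in one atomic cmpxchg_double on this cpu's data; the sketch
 * only spells out the comparison logic, so atomicity is described, not
 * implemented.  Names are hypothetical.
 */
struct sketch_cpu_slab {
	void *freelist;
	unsigned long tid;
};

static inline int sketch_fastpath_take(struct sketch_cpu_slab *c,
				       void *seen_object, unsigned long seen_tid,
				       void *next_object)
{
	/* both words must be unchanged: same freelist head, same transaction */
	if (c->freelist != seen_object || c->tid != seen_tid)
		return 0;			/* redo: cpu or list changed */

	c->freelist = next_object;		/* advance the lockless freelist */
	c->tid = seen_tid + 1;			/* stands in for next_tid(seen_tid) */
	return 1;
}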
2222 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2224 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2226 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
2230 EXPORT_SYMBOL(kmem_cache_alloc);
2232 #ifdef CONFIG_TRACING
2233 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2235 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2236 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2239 EXPORT_SYMBOL(kmem_cache_alloc_trace);
2241 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
2243 void *ret = kmalloc_order(size, flags, order);
2244 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
2247 EXPORT_SYMBOL(kmalloc_order_trace);
2251 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2253 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2255 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2256 s->objsize, s->size, gfpflags, node);
2260 EXPORT_SYMBOL(kmem_cache_alloc_node);
2262 #ifdef CONFIG_TRACING
2263 void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2265 int node, size_t size)
2267 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2269 trace_kmalloc_node(_RET_IP_, ret,
2270 size, s->size, gfpflags, node);
2273 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2278 * Slow path handling. This may still be called frequently since objects
2279 * have a longer lifetime than the cpu slabs in most processing loads.
2281 * So we still attempt to reduce cache line usage. Just take the slab
2282 * lock and free the item. If there is no additional partial page
2283 * handling required then we can return immediately.
2285 static void __slab_free(struct kmem_cache *s, struct page *page,
2286 void *x, unsigned long addr)
2289 void **object = (void *)x;
2293 unsigned long counters;
2294 struct kmem_cache_node *n = NULL;
2295 unsigned long uninitialized_var(flags);
2297 stat(s, FREE_SLOWPATH);
2299 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2303 prior = page->freelist;
2304 counters = page->counters;
2305 set_freepointer(s, object, prior);
2306 new.counters = counters;
2307 was_frozen = new.frozen;
2309 if ((!new.inuse || !prior) && !was_frozen && !n) {
2310 n = get_node(s, page_to_nid(page));
2312 * Speculatively acquire the list_lock.
2313 * If the cmpxchg does not succeed then we may
2314 * drop the list_lock without any processing.
2316 * Otherwise the list_lock will synchronize with
2317 * other processors updating the list of slabs.
2319 spin_lock_irqsave(&n->list_lock, flags);
2323 } while (!cmpxchg_double_slab(s, page,
2325 object, new.counters,
2330 * The list lock was not taken therefore no list
2331 * activity can be necessary.
2334 stat(s, FREE_FROZEN);
2339 * was_frozen may have been set after we acquired the list_lock in
2340 * an earlier loop. So we need to check it here again.
2343 stat(s, FREE_FROZEN);
2345 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2349 * Objects left in the slab. If it was not on the partial list before
2352 if (unlikely(!prior)) {
2353 remove_full(s, page);
2354 add_partial(n, page, DEACTIVATE_TO_TAIL);
2355 stat(s, FREE_ADD_PARTIAL);
2358 spin_unlock_irqrestore(&n->list_lock, flags);
2364 * Slab on the partial list.
2366 remove_partial(n, page);
2367 stat(s, FREE_REMOVE_PARTIAL);
2369 /* Slab must be on the full list */
2370 remove_full(s, page);
2372 spin_unlock_irqrestore(&n->list_lock, flags);
2374 discard_slab(s, page);
2378 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
2379 * can perform fastpath freeing without additional function calls.
2381 * The fastpath is only possible if we are freeing to the current cpu slab
2382 * of this processor. This is typically the case if we have just allocated
2385 * If fastpath is not possible then fall back to __slab_free where we deal
2386 * with all sorts of special processing.
2388 static __always_inline void slab_free(struct kmem_cache *s,
2389 struct page *page, void *x, unsigned long addr)
2391 void **object = (void *)x;
2392 struct kmem_cache_cpu *c;
2395 slab_free_hook(s, x);
2400 * Determine the current cpu's per cpu slab.
2401 * The cpu may change afterward. However that does not matter since
2402 * data is retrieved via this pointer. If we are on the same cpu
2403 * during the cmpxchg then the free will succeed.
2405 c = __this_cpu_ptr(s->cpu_slab);
2410 if (likely(page == c->page)) {
2411 set_freepointer(s, object, c->freelist);
2413 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2414 s->cpu_slab->freelist, s->cpu_slab->tid,
2416 object, next_tid(tid)))) {
2418 note_cmpxchg_failure("slab_free", s, tid);
2421 stat(s, FREE_FASTPATH);
2423 __slab_free(s, page, x, addr);
2427 void kmem_cache_free(struct kmem_cache *s, void *x)
2431 page = virt_to_head_page(x);
2433 slab_free(s, page, x, _RET_IP_);
2435 trace_kmem_cache_free(_RET_IP_, x);
2437 EXPORT_SYMBOL(kmem_cache_free);
2440 * Object placement in a slab is made very easy because we always start at
2441 * offset 0. If we tune the size of the object to the alignment then we can
2442 * get the required alignment by putting one properly sized object after
2445 * Notice that the allocation order determines the sizes of the per cpu
2446 * caches. Each processor always has one slab available for allocations.
2447 * Increasing the allocation order reduces the number of times that slabs
2448 * must be moved on and off the partial lists and is therefore a factor in
2453 * Minimum / Maximum order of slab pages. This influences locking overhead
2454 * and slab fragmentation. A higher order reduces the number of partial slabs
2455 * and increases the number of allocations possible without having to
2456 * take the list_lock.
2458 static int slub_min_order;
2459 static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2460 static int slub_min_objects;
2463 * Merge control. If this is set then no merging of slab caches will occur.
2464 * (Could be removed. This was introduced to pacify the merge skeptics.)
2466 static int slub_nomerge;
2469 * Calculate the order of allocation given a slab object size.
2471 * The order of allocation has significant impact on performance and other
2472 * system components. Generally order 0 allocations should be preferred since
2473 * order 0 does not cause fragmentation in the page allocator. Larger objects
2474 * can be problematic to put into order 0 slabs because there may be too much
2475 * unused space left. We go to a higher order if more than 1/16th of the slab would be wasted.
2478 * In order to reach satisfactory performance we must ensure that a minimum
2479 * number of objects is in one slab. Otherwise we may generate too much
2480 * activity on the partial lists which requires taking the list_lock. This is
2481 * less a concern for large slabs though which are rarely used.
2483 * slub_max_order specifies the order at which we stop considering the
2484 * number of objects in a slab as critical. If we reach slub_max_order then
2485 * we try to keep the page order as low as possible. So we accept more waste
2486 * of space in favor of a small page order.
2488 * Higher order allocations also allow the placement of more objects in a
2489 * slab and thereby reduce object handling overhead. If the user has
2490 * requested a higher minimum order then we start with that one instead of
2491 * the smallest order which will fit the object.
2493 static inline int slab_order(int size, int min_objects,
2494 int max_order, int fract_leftover, int reserved)
2498 int min_order = slub_min_order;
2500 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
2501 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
2503 for (order = max(min_order,
2504 fls(min_objects * size - 1) - PAGE_SHIFT);
2505 order <= max_order; order++) {
2507 unsigned long slab_size = PAGE_SIZE << order;
2509 if (slab_size < min_objects * size + reserved)
2512 rem = (slab_size - reserved) % size;
2514 if (rem <= slab_size / fract_leftover)
2522 static inline int calculate_order(int size, int reserved)
2530 * Attempt to find the best configuration for a slab. This
2531 * works by first attempting to generate a layout with
2532 * the best configuration and backing off gradually.
2534 * First we reduce the acceptable waste in a slab. Then
2535 * we reduce the minimum objects required in a slab.
2537 min_objects = slub_min_objects;
2539 min_objects = 4 * (fls(nr_cpu_ids) + 1);
2540 max_objects = order_objects(slub_max_order, size, reserved);
2541 min_objects = min(min_objects, max_objects);
2543 while (min_objects > 1) {
2545 while (fraction >= 4) {
2546 order = slab_order(size, min_objects,
2547 slub_max_order, fraction, reserved);
2548 if (order <= slub_max_order)
2556 * We were unable to place multiple objects in a slab. Now
2557 * lets see if we can place a single object there.
2559 order = slab_order(size, 1, slub_max_order, 1, reserved);
2560 if (order <= slub_max_order)
2564 * Doh this slab cannot be placed using slub_max_order.
2566 order = slab_order(size, 1, MAX_ORDER, 1, reserved);
2567 if (order < MAX_ORDER)
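/*
 * Worked example (illustrative, assuming 4KiB pages, reserved == 0,
 * slub_min_order == 0, min_objects == 4 and the initial fract_leftover of
 * 16): for a 700 byte object, order 0 leaves 4096 % 700 = 596 bytes unused,
 * which is more than 4096 / 16 = 256, so slab_order() moves on; order 1
 * leaves 8192 % 700 = 492 bytes, which is within 8192 / 16 = 512, so an
 * order 1 slab holding 11 objects is chosen.
 */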
2573 * Figure out what the alignment of the objects will be.
2575 static unsigned long calculate_alignment(unsigned long flags,
2576 unsigned long align, unsigned long size)
2579 * If the user wants hardware cache aligned objects then follow that
2580 * suggestion if the object is sufficiently large.
2582 * The hardware cache alignment cannot override the specified
2583 * alignment though. If that is greater then use it.
2585 if (flags & SLAB_HWCACHE_ALIGN) {
2586 unsigned long ralign = cache_line_size();
2587 while (size <= ralign / 2)
2589 align = max(align, ralign);
2592 if (align < ARCH_SLAB_MINALIGN)
2593 align = ARCH_SLAB_MINALIGN;
2595 return ALIGN(align, sizeof(void *));
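/*
 * Worked example (illustrative, assuming a 64 byte cache line and that the
 * elided loop body halves ralign as in mainline): a 20 byte object created
 * with SLAB_HWCACHE_ALIGN and a requested align of 8 sees ralign drop from
 * 64 to 32 (20 <= 32 but 20 > 16), so the returned alignment is
 * max(8, 32) = 32, which is already a multiple of sizeof(void *).
 */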
2599 init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2602 spin_lock_init(&n->list_lock);
2603 INIT_LIST_HEAD(&n->partial);
2604 #ifdef CONFIG_SLUB_DEBUG
2605 atomic_long_set(&n->nr_slabs, 0);
2606 atomic_long_set(&n->total_objects, 0);
2607 INIT_LIST_HEAD(&n->full);
2611 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2613 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2614 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2617 * Must align to double word boundary for the double cmpxchg
2618 * instructions to work; see __pcpu_double_call_return_bool().
2620 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
2621 2 * sizeof(void *));
2626 init_kmem_cache_cpus(s);
2631 static struct kmem_cache *kmem_cache_node;
2634 * No kmalloc_node yet so do it by hand. We know that this is the first
2635 * slab on the node for this slabcache. There are no concurrent accesses possible.
2638 * Note that this function only works on the kmalloc_node_cache
2639 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2640 * memory on a fresh node that has no slab structures yet.
2642 static void early_kmem_cache_node_alloc(int node)
2645 struct kmem_cache_node *n;
2647 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2649 page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2652 if (page_to_nid(page) != node) {
2653 printk(KERN_ERR "SLUB: Unable to allocate memory from "
2655 printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2656 "in order to be able to continue\n");
2661 page->freelist = get_freepointer(kmem_cache_node, n);
2664 kmem_cache_node->node[node] = n;
2665 #ifdef CONFIG_SLUB_DEBUG
2666 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2667 init_tracking(kmem_cache_node, n);
2669 init_kmem_cache_node(n, kmem_cache_node);
2670 inc_slabs_node(kmem_cache_node, node, page->objects);
2672 add_partial(n, page, DEACTIVATE_TO_HEAD);
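/*
 * In short (added for clarity): the bootstrap above resolves the
 * chicken-and-egg problem by allocating a slab page directly, carving the
 * first free object out of it by hand to serve as the node's
 * struct kmem_cache_node, and then registering that page as a partial slab
 * so that later kmem_cache_node allocations work normally.
 */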
2675 static void free_kmem_cache_nodes(struct kmem_cache *s)
2679 for_each_node_state(node, N_NORMAL_MEMORY) {
2680 struct kmem_cache_node *n = s->node[node];
2683 kmem_cache_free(kmem_cache_node, n);
2685 s->node[node] = NULL;
2689 static int init_kmem_cache_nodes(struct kmem_cache *s)
2693 for_each_node_state(node, N_NORMAL_MEMORY) {
2694 struct kmem_cache_node *n;
2696 if (slab_state == DOWN) {
2697 early_kmem_cache_node_alloc(node);
2700 n = kmem_cache_alloc_node(kmem_cache_node,
2704 free_kmem_cache_nodes(s);
2709 init_kmem_cache_node(n, s);
2714 static void set_min_partial(struct kmem_cache *s, unsigned long min)
2716 if (min < MIN_PARTIAL)
2718 else if (min > MAX_PARTIAL)
2720 s->min_partial = min;
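/*
 * Example (illustrative): a cache with s->size == 192 requests
 * ilog2(192) = 7 partial slabs kept per node (see the caller in
 * kmem_cache_open() below); the checks above clamp that request into the
 * [MIN_PARTIAL, MAX_PARTIAL] range for very small or very large objects.
 */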
2724 * calculate_sizes() determines the order and the distribution of data within a slab object.
2727 static int calculate_sizes(struct kmem_cache *s, int forced_order)
2729 unsigned long flags = s->flags;
2730 unsigned long size = s->objsize;
2731 unsigned long align = s->align;
2735 * Round up object size to the next word boundary. We can only
2736 * place the free pointer at word boundaries and this determines
2737 * the possible location of the free pointer.
2739 size = ALIGN(size, sizeof(void *));
2741 #ifdef CONFIG_SLUB_DEBUG
2743 * Determine if we can poison the object itself. If the user of
2744 * the slab may touch the object after free or before allocation
2745 * then we should never poison the object itself.
2747 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
2749 s->flags |= __OBJECT_POISON;
2751 s->flags &= ~__OBJECT_POISON;
2755 * If we are Redzoning then check if there is some space between the
2756 * end of the object and the free pointer. If not then add an
2757 * additional word to have some bytes to store Redzone information.
2759 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2760 size += sizeof(void *);
2764 * With that we have determined the number of bytes in actual use
2765 * by the object. This is the potential offset to the free pointer.
2769 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
2772 * Relocate free pointer after the object if it is not
2773 * permitted to overwrite the first word of the object on kmem_cache_free.
2776 * This is the case if we do RCU, have a constructor or
2777 * destructor or are poisoning the objects.
2780 size += sizeof(void *);
2783 #ifdef CONFIG_SLUB_DEBUG
2784 if (flags & SLAB_STORE_USER)
2786 * Need to store information about allocs and frees after the object.
2789 size += 2 * sizeof(struct track);
2791 if (flags & SLAB_RED_ZONE)
2793 * Add some empty padding so that we can catch
2794 * overwrites from earlier objects rather than let
2795 * tracking information or the free pointer be
2796 * corrupted if a user writes before the start of the object.
2799 size += sizeof(void *);
2803 * Determine the alignment based on various parameters that the
2804 * user specified and the dynamic determination of cache line size on bootup.
2807 align = calculate_alignment(flags, align, s->objsize);
2811 * SLUB stores one object immediately after another beginning from
2812 * offset 0. In order to align the objects we have to simply size
2813 * each object to conform to the alignment.
2815 size = ALIGN(size, align);
2817 if (forced_order >= 0)
2818 order = forced_order;
2820 order = calculate_order(size, s->reserved);
2827 s->allocflags |= __GFP_COMP;
2829 if (s->flags & SLAB_CACHE_DMA)
2830 s->allocflags |= SLUB_DMA;
2832 if (s->flags & SLAB_RECLAIM_ACCOUNT)
2833 s->allocflags |= __GFP_RECLAIMABLE;
2836 * Determine the number of objects per slab
2838 s->oo = oo_make(order, size, s->reserved);
2839 s->min = oo_make(get_order(size), size, s->reserved);
2840 if (oo_objects(s->oo) > oo_objects(s->max))
2843 return !!oo_objects(s->oo);
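/*
 * Worked example (illustrative, 64-bit, no debug flags, no ctor, no RCU): a
 * 24 byte object stays 24 bytes after the word alignment above, the free
 * pointer overlays the start of the object (offset 0), and with the default
 * minimum alignment the final s->size is 24, so a single 4KiB page holds
 * 4096 / 24 = 170 such objects. Debug options such as SLAB_RED_ZONE,
 * SLAB_POISON and SLAB_STORE_USER grow s->size by the red zone word, a
 * relocated free pointer and two struct track records as laid out above.
 */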
2847 static int kmem_cache_open(struct kmem_cache *s,
2848 const char *name, size_t size,
2849 size_t align, unsigned long flags,
2850 void (*ctor)(void *))
2852 memset(s, 0, kmem_size);
2857 s->flags = kmem_cache_flags(size, flags, name, ctor);
2860 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
2861 s->reserved = sizeof(struct rcu_head);
2863 if (!calculate_sizes(s, -1))
2865 if (disable_higher_order_debug) {
2867 * Disable debugging flags that store metadata if the min slab order increased.
2870 if (get_order(s->size) > get_order(s->objsize)) {
2871 s->flags &= ~DEBUG_METADATA_FLAGS;
2873 if (!calculate_sizes(s, -1))
2878 #ifdef CONFIG_CMPXCHG_DOUBLE
2879 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
2880 /* Enable fast mode */
2881 s->flags |= __CMPXCHG_DOUBLE;
2885 * The larger the object size is, the more pages we want on the partial
2886 * list to avoid pounding the page allocator excessively.
2888 set_min_partial(s, ilog2(s->size));
2891 s->remote_node_defrag_ratio = 1000;
2893 if (!init_kmem_cache_nodes(s))
2896 if (alloc_kmem_cache_cpus(s))
2899 free_kmem_cache_nodes(s);
2901 if (flags & SLAB_PANIC)
2902 panic("Cannot create slab %s size=%lu realsize=%u "
2903 "order=%u offset=%u flags=%lx\n",
2904 s->name, (unsigned long)size, s->size, oo_order(s->oo),
2910 * Determine the size of a slab object
2912 unsigned int kmem_cache_size(struct kmem_cache *s)
2916 EXPORT_SYMBOL(kmem_cache_size);
2918 static void list_slab_objects(struct kmem_cache *s, struct page *page,
2921 #ifdef CONFIG_SLUB_DEBUG
2922 void *addr = page_address(page);
2924 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
2925 sizeof(long), GFP_ATOMIC);
2928 slab_err(s, page, "%s", text);
2931 get_map(s, page, map);
2932 for_each_object(p, s, addr, page->objects) {
2934 if (!test_bit(slab_index(p, s, addr), map)) {
2935 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
2937 print_tracking(s, p);
2946 * Attempt to free all partial slabs on a node.
2948 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2950 unsigned long flags;
2951 struct page *page, *h;
2953 spin_lock_irqsave(&n->list_lock, flags);
2954 list_for_each_entry_safe(page, h, &n->partial, lru) {
2956 remove_partial(n, page);
2957 discard_slab(s, page);
2959 list_slab_objects(s, page,
2960 "Objects remaining on kmem_cache_close()");
2963 spin_unlock_irqrestore(&n->list_lock, flags);
2967 * Release all resources used by a slab cache.
2969 static inline int kmem_cache_close(struct kmem_cache *s)
2974 free_percpu(s->cpu_slab);
2975 /* Attempt to free all objects */
2976 for_each_node_state(node, N_NORMAL_MEMORY) {
2977 struct kmem_cache_node *n = get_node(s, node);
2980 if (n->nr_partial || slabs_node(s, node))
2983 free_kmem_cache_nodes(s);
2988 * Close a cache and release the kmem_cache structure
2989 * (must be used for caches created using kmem_cache_create)
2991 void kmem_cache_destroy(struct kmem_cache *s)
2993 down_write(&slub_lock);
2997 if (kmem_cache_close(s)) {
2998 printk(KERN_ERR "SLUB %s: %s called for cache that "
2999 "still has objects.\n", s->name, __func__);
3002 if (s->flags & SLAB_DESTROY_BY_RCU)
3004 sysfs_slab_remove(s);
3006 up_write(&slub_lock);
3008 EXPORT_SYMBOL(kmem_cache_destroy);
3010 /********************************************************************
 *		Kmalloc subsystem
3012 *******************************************************************/
3014 struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
3015 EXPORT_SYMBOL(kmalloc_caches);
3017 static struct kmem_cache *kmem_cache;
3019 #ifdef CONFIG_ZONE_DMA
3020 static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
3023 static int __init setup_slub_min_order(char *str)
3025 get_option(&str, &slub_min_order);
3030 __setup("slub_min_order=", setup_slub_min_order);
3032 static int __init setup_slub_max_order(char *str)
3034 get_option(&str, &slub_max_order);
3035 slub_max_order = min(slub_max_order, MAX_ORDER - 1);
3040 __setup("slub_max_order=", setup_slub_max_order);
3042 static int __init setup_slub_min_objects(char *str)
3044 get_option(&str, &slub_min_objects);
3049 __setup("slub_min_objects=", setup_slub_min_objects);
3051 static int __init setup_slub_nomerge(char *str)
3057 __setup("slub_nomerge", setup_slub_nomerge);
3059 static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3060 int size, unsigned int flags)
3062 struct kmem_cache *s;
3064 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3067 * This function is called with IRQs disabled during early-boot on
3068 * single CPU so there's no need to take slub_lock here.
3070 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
3074 list_add(&s->list, &slab_caches);
3078 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
3083 * Conversion table for small slab sizes / 8 to the index in the
3084 * kmalloc array. This is necessary for slabs < 192 since we have non power
3085 * of two cache sizes there. The size of larger slabs can be determined using fls.
3088 static s8 size_index[24] = {
3115 static inline int size_index_elem(size_t bytes)
3117 return (bytes - 1) / 8;
3120 static struct kmem_cache *get_slab(size_t size, gfp_t flags)
3126 return ZERO_SIZE_PTR;
3128 index = size_index[size_index_elem(size)];
3130 index = fls(size - 1);
3132 #ifdef CONFIG_ZONE_DMA
3133 if (unlikely((flags & SLUB_DMA)))
3134 return kmalloc_dma_caches[index];
3137 return kmalloc_caches[index];
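/*
 * Example (illustrative): kmalloc(200, GFP_KERNEL) is above 192 bytes, so
 * the index is fls(200 - 1) = 8 and the request is served from the 256 byte
 * kmalloc cache; requests of 192 bytes or less go through size_index[]
 * above, which also covers the non power of two 96 and 192 byte caches.
 */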
3140 void *__kmalloc(size_t size, gfp_t flags)
3142 struct kmem_cache *s;
3145 if (unlikely(size > SLUB_MAX_SIZE))
3146 return kmalloc_large(size, flags);
3148 s = get_slab(size, flags);
3150 if (unlikely(ZERO_OR_NULL_PTR(s)))
3153 ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
3155 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3159 EXPORT_SYMBOL(__kmalloc);
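/*
 * Note (added for clarity): requests larger than SLUB_MAX_SIZE skip the slab
 * caches entirely; kmalloc_large()/kmalloc_large_node() hand them straight
 * to the page allocator as compound pages, which is also why ksize() and
 * kfree() below check PageSlab() before touching any slab metadata.
 */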
3162 static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3167 flags |= __GFP_COMP | __GFP_NOTRACK;
3168 page = alloc_pages_node(node, flags, get_order(size));
3170 ptr = page_address(page);
3172 kmemleak_alloc(ptr, size, 1, flags);
3176 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3178 struct kmem_cache *s;
3181 if (unlikely(size > SLUB_MAX_SIZE)) {
3182 ret = kmalloc_large_node(size, flags, node);
3184 trace_kmalloc_node(_RET_IP_, ret,
3185 size, PAGE_SIZE << get_order(size),
3191 s = get_slab(size, flags);
3193 if (unlikely(ZERO_OR_NULL_PTR(s)))
3196 ret = slab_alloc(s, flags, node, _RET_IP_);
3198 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3202 EXPORT_SYMBOL(__kmalloc_node);
3205 size_t ksize(const void *object)
3209 if (unlikely(object == ZERO_SIZE_PTR))
3212 page = virt_to_head_page(object);
3214 if (unlikely(!PageSlab(page))) {
3215 WARN_ON(!PageCompound(page));
3216 return PAGE_SIZE << compound_order(page);
3219 return slab_ksize(page->slab);
3221 EXPORT_SYMBOL(ksize);
3223 #ifdef CONFIG_SLUB_DEBUG
3224 bool verify_mem_not_deleted(const void *x)
3227 void *object = (void *)x;
3228 unsigned long flags;
3231 if (unlikely(ZERO_OR_NULL_PTR(x)))
3234 local_irq_save(flags);
3236 page = virt_to_head_page(x);
3237 if (unlikely(!PageSlab(page))) {
3238 /* maybe it was from stack? */
3244 if (on_freelist(page->slab, page, object)) {
3245 object_err(page->slab, page, object, "Object is on free-list");
3253 local_irq_restore(flags);
3256 EXPORT_SYMBOL(verify_mem_not_deleted);
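/*
 * Illustrative sketch (not part of the original file, CONFIG_SLUB_DEBUG
 * only): a caller that wants to assert an object is still live before using
 * it. The SLUB_EXAMPLE_USAGE guard and the function are made up.
 */
#ifdef SLUB_EXAMPLE_USAGE
static void example_check_still_allocated(const void *obj)
{
	/* Warn if obj sits on a freelist or is otherwise not a live object. */
	WARN_ON(!verify_mem_not_deleted(obj));
}
#endif /* SLUB_EXAMPLE_USAGE */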
3259 void kfree(const void *x)
3262 void *object = (void *)x;
3264 trace_kfree(_RET_IP_, x);
3266 if (unlikely(ZERO_OR_NULL_PTR(x)))
3269 page = virt_to_head_page(x);
3270 if (unlikely(!PageSlab(page))) {
3271 BUG_ON(!PageCompound(page));
3276 slab_free(page->slab, page, object, _RET_IP_);
3278 EXPORT_SYMBOL(kfree);
3281 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
3282 * the remaining slabs by the number of items in use. The slabs with the
3283 * most items in use come first. New allocations will then fill those up
3284 * and thus they can be removed from the partial lists.
3286 * The slabs with the least items are placed last. This results in them
3287 * being allocated from last, increasing the chance that the last objects
3288 * are freed in them.
3290 int kmem_cache_shrink(struct kmem_cache *s)
3294 struct kmem_cache_node *n;
3297 int objects = oo_objects(s->max);
3298 struct list_head *slabs_by_inuse =
3299 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
3300 unsigned long flags;
3302 if (!slabs_by_inuse)
3306 for_each_node_state(node, N_NORMAL_MEMORY) {
3307 n = get_node(s, node);
3312 for (i = 0; i < objects; i++)
3313 INIT_LIST_HEAD(slabs_by_inuse + i);
3315 spin_lock_irqsave(&n->list_lock, flags);
3318 * Build lists indexed by the items in use in each slab.
3320 * Note that concurrent frees may occur while we hold the
3321 * list_lock. page->inuse here is the upper limit.
3323 list_for_each_entry_safe(page, t, &n->partial, lru) {
3325 remove_partial(n, page);
3326 discard_slab(s, page);
3328 list_move(&page->lru,
3329 slabs_by_inuse + page->inuse);
3334 * Rebuild the partial list with the slabs filled up most
3335 * first and the least used slabs at the end.
3337 for (i = objects - 1; i >= 0; i--)
3338 list_splice(slabs_by_inuse + i, n->partial.prev);
3340 spin_unlock_irqrestore(&n->list_lock, flags);
3343 kfree(slabs_by_inuse);
3346 EXPORT_SYMBOL(kmem_cache_shrink);
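/*
 * Example (illustrative): if a node's partial list holds slabs with 0, 3 and
 * 7 objects in use, kmem_cache_shrink() discards the empty one and rebuilds
 * the list as 7 then 3, so new allocations fill the fullest slab first and
 * the emptiest slab gets the best chance of draining completely.
 */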
3348 #if defined(CONFIG_MEMORY_HOTPLUG)
3349 static int slab_mem_going_offline_callback(void *arg)
3351 struct kmem_cache *s;
3353 down_read(&slub_lock);
3354 list_for_each_entry(s, &slab_caches, list)
3355 kmem_cache_shrink(s);
3356 up_read(&slub_lock);
3361 static void slab_mem_offline_callback(void *arg)
3363 struct kmem_cache_node *n;
3364 struct kmem_cache *s;
3365 struct memory_notify *marg = arg;
3368 offline_node = marg->status_change_nid;
3371 * If the node still has available memory, we still need its kmem_cache_node, so there is nothing to do here.
3374 if (offline_node < 0)
3377 down_read(&slub_lock);
3378 list_for_each_entry(s, &slab_caches, list) {
3379 n = get_node(s, offline_node);
3382 * if n->nr_slabs > 0, slabs still exist on the node
3383 * that is going down. We were unable to free them,
3384 * and the offline_pages() function shouldn't have called this
3385 * callback. So, we must fail.
3387 BUG_ON(slabs_node(s, offline_node));
3389 s->node[offline_node] = NULL;
3390 kmem_cache_free(kmem_cache_node, n);
3393 up_read(&slub_lock);
3396 static int slab_mem_going_online_callback(void *arg)
3398 struct kmem_cache_node *n;
3399 struct kmem_cache *s;
3400 struct memory_notify *marg = arg;
3401 int nid = marg->status_change_nid;
3405 * If the node's memory is already available, then kmem_cache_node is
3406 * already created. Nothing to do.
3412 * We are bringing a node online. No memory is available yet. We must
3413 * allocate a kmem_cache_node structure in order to bring the node online.
3416 down_read(&slub_lock);
3417 list_for_each_entry(s, &slab_caches, list) {
3419 * XXX: kmem_cache_alloc_node will fall back to other nodes
3420 * since memory is not yet available from the node that is being brought up.
3423 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
3428 init_kmem_cache_node(n, s);