/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *              Christoph Lameter <christoph@lameter.com>
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
{
        int cpu;
        int i;

        memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

        for_each_cpu(cpu, cpumask) {
                struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
                        ret[i] += this->event[i];
        }
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
        get_online_cpus();
        sum_vm_events(ret, cpu_online_mask);
        put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
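
/*
 * Example (illustrative): a reader can snapshot all event counters and
 * then pick out individual items:
 *
 *      unsigned long events[NR_VM_EVENT_ITEMS];
 *      unsigned long faults;
 *
 *      all_vm_events(events);
 *      faults = events[PGFAULT];
 *
 * The snapshot is only approximate, as noted above.
 */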

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
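 *
 * Typically this is invoked from the page allocator's CPU hotplug
 * handling when a processor goes offline, so that the departing CPU's
 * event counts are not lost.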
 */
void vm_events_fold_cpu(int cpu)
{
        struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
        int i;

        for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
                count_vm_events(i, fold_state->event[i]);
                fold_state->event[i] = 0;
        }
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_threshold(struct zone *zone)
{
        int threshold;
        int mem;        /* memory in 128 MB units */

        /*
         * The threshold scales with the number of processors and the amount
         * of memory per zone. More memory means that we can defer updates for
         * longer, more processors could lead to more contention.
         * fls() is used to have a cheap way of logarithmic scaling.
         *
         * Some sample thresholds:
         *
         * Threshold    Processors      (fls)   Zonesize        fls(mem+1)
         * ------------------------------------------------------------------
         * 8            1               1       0.9-1 GB        4
         * 16           2               2       0.9-1 GB        4
         * 20           2               2       1-2 GB          5
         * 24           2               2       2-4 GB          6
         * 28           2               2       4-8 GB          7
         * 32           2               2       8-16 GB         8
         * 4            2               2       <128M           1
         * 30           4               3       2-4 GB          5
         * 48           4               3       8-16 GB         8
         * 32           8               4       1-2 GB          4
         * 32           8               4       0.9-1GB         4
         * 10           16              5       <128M           1
         * 40           16              5       900M            4
         * 70           64              7       2-4 GB          5
         * 84           64              7       4-8 GB          6
         * 108          512             9       4-8 GB          6
         * 125          1024            10      8-16 GB         8
         * 125          1024            10      16-32 GB        9
         */

        mem = zone->present_pages >> (27 - PAGE_SHIFT);

        threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

        /*
         * Maximum threshold is 125
         */
        threshold = min(125, threshold);

        return threshold;
}
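
/*
 * Worked example (matches the first row of the table above): with a
 * single online CPU, fls(1) = 1, and a zone of a bit under 1GB, i.e.
 * mem = 7 units of 128MB and fls(7) = 3, we get
 *
 *      threshold = 2 * 1 * (1 + 3) = 8
 *
 * which is well below the 125 cap.
 */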

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
        struct zone *zone;
        int cpu;
        int threshold;

        for_each_populated_zone(zone) {
                threshold = calculate_threshold(zone);

                for_each_online_cpu(cpu)
                        per_cpu_ptr(zone->pageset, cpu)->stat_threshold
                                                        = threshold;
        }
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                int delta)
{
        struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);

        s8 *p = pcp->vm_stat_diff + item;
        long x;

        x = delta + *p;

        if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
                zone_page_state_add(x, zone, item);
                x = 0;
        }
        *p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);
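
/*
 * Example (illustrative): a caller that already runs with interrupts
 * disabled, e.g. inside a spin_lock_irqsave() section, can account a
 * page directly:
 *
 *      __mod_zone_page_state(zone, NR_FILE_PAGES, -1);
 *
 * Callers that cannot guarantee the interrupt state should use
 * mod_zone_page_state() below instead.
 */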

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                        int delta)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_zone_page_state(zone, item, delta);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
        s8 *p = pcp->vm_stat_diff + item;

        (*p)++;

        if (unlikely(*p > pcp->stat_threshold)) {
                int overstep = pcp->stat_threshold / 2;

                zone_page_state_add(*p + overstep, zone, item);
                *p = -overstep;
        }
}
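
/*
 * Numeric example (illustrative): with a stat_threshold of 32, the local
 * differential can reach 33 before it is folded back.  At that point
 * 33 + 16 = 49 is added to the zone counter and the differential is
 * reset to -16, so roughly 49 further increments can happen before the
 * next fold.
 */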

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
        s8 *p = pcp->vm_stat_diff + item;

        (*p)--;

        if (unlikely(*p < - pcp->stat_threshold)) {
                int overstep = pcp->stat_threshold / 2;

                zone_page_state_add(*p - overstep, zone, item);
                *p = overstep;
        }
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        unsigned long flags;

        local_irq_save(flags);
        __inc_zone_state(zone, item);
        local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
        unsigned long flags;
        struct zone *zone;

        zone = page_zone(page);
        local_irq_save(flags);
        __inc_zone_state(zone, item);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
        unsigned long flags;

        local_irq_save(flags);
        __dec_zone_page_state(page, item);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

/*
 * Update the zone counters for one cpu.
 *
 * The cpu specified must be either the current cpu or a processor that
 * is not online. If it is the current cpu then the execution thread must
 * be pinned to the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 */
void refresh_cpu_vm_stats(int cpu)
{
        struct zone *zone;
        int i;
        int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

        for_each_populated_zone(zone) {
                struct per_cpu_pageset *p;

                p = per_cpu_ptr(zone->pageset, cpu);

                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                        if (p->vm_stat_diff[i]) {
                                unsigned long flags;
                                int v;

                                local_irq_save(flags);
                                v = p->vm_stat_diff[i];
                                p->vm_stat_diff[i] = 0;
                                local_irq_restore(flags);
                                atomic_long_add(v, &zone->vm_stat[i]);
                                global_diff[i] += v;
#ifdef CONFIG_NUMA
                                /* 3 seconds idle till flush */
                                p->expire = 3;
#endif
                        }
                cond_resched();
#ifdef CONFIG_NUMA
                /*
                 * Deal with draining the remote pageset of this
                 * processor
                 *
                 * Check if there are pages remaining in this pageset
                 * if not then there is nothing to expire.
                 */
                if (!p->expire || !p->pcp.count)
                        continue;

                /*
                 * We never drain zones local to this processor.
                 */
                if (zone_to_nid(zone) == numa_node_id()) {
                        p->expire = 0;
                        continue;
                }

                p->expire--;
                if (p->expire)
                        continue;

                if (p->pcp.count)
                        drain_zone_pages(zone, &p->pcp);
#endif
        }

        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                if (global_diff[i])
                        atomic_long_add(global_diff[i], &vm_stat[i]);
}
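
/*
 * Note: this is driven periodically from the vmstat_update() work below,
 * every sysctl_stat_interval jiffies on each online CPU, and may also be
 * called for a processor that has gone offline, per the rules above.
 */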

#endif

#ifdef CONFIG_NUMA
/*
 * preferred_zone = the zone preferred by the allocator
 * z              = the zone from which the allocation occurred
 *
 * Must be called with interrupts disabled.
 */
void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
        if (z->zone_pgdat == preferred_zone->zone_pgdat) {
                __inc_zone_state(z, NUMA_HIT);
        } else {
                __inc_zone_state(z, NUMA_MISS);
                __inc_zone_state(preferred_zone, NUMA_FOREIGN);
        }
        if (z->node == numa_node_id())
                __inc_zone_state(z, NUMA_LOCAL);
        else
                __inc_zone_state(z, NUMA_OTHER);
}
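
/*
 * Example (illustrative): a task running on node 0 whose preferred zone
 * is on node 0 but whose allocation is satisfied from a zone on node 1
 * is counted as NUMA_MISS on the node 1 zone, NUMA_FOREIGN on the
 * preferred node 0 zone, and NUMA_OTHER on the node 1 zone (because the
 * allocating CPU is not local to it).
 */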
#endif

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static char * const migratetype_names[MIGRATE_TYPES] = {
        "Unmovable",
        "Reclaimable",
        "Movable",
        "Reserve",
        "Isolate",
};

static void *frag_start(struct seq_file *m, loff_t *pos)
{
        pg_data_t *pgdat;
        loff_t node = *pos;
        for (pgdat = first_online_pgdat();
             pgdat && node;
             pgdat = next_online_pgdat(pgdat))
                --node;

        return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
        pg_data_t *pgdat = (pg_data_t *)arg;

        (*pos)++;
        return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/* Walk all the zones in a node and print using a callback */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
                void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
        struct zone *zone;
        struct zone *node_zones = pgdat->node_zones;
        unsigned long flags;

        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
                if (!populated_zone(zone))
                        continue;

                spin_lock_irqsave(&zone->lock, flags);
                print(m, pgdat, zone);
                spin_unlock_irqrestore(&zone->lock, flags);
        }
}
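
/*
 * Note that the print callback runs under zone->lock with interrupts
 * disabled, so it must not sleep.
 */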

static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
                                                struct zone *zone)
{
        int order;

        seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
        for (order = 0; order < MAX_ORDER; ++order)
                seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
        seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = (pg_data_t *)arg;
        walk_zones_in_node(m, pgdat, frag_show_print);
        return 0;
}
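
/*
 * Each /proc/buddyinfo line produced above therefore looks like this
 * (the counts are made up):
 *
 *      Node 0, zone   Normal    145     62     31     15      7      3      1      1      0      0      1
 *
 * with one column of free block counts per order, 0 .. MAX_ORDER-1.
 */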

static void pagetypeinfo_showfree_print(struct seq_file *m,
                                        pg_data_t *pgdat, struct zone *zone)
{
        int order, mtype;

        for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
                seq_printf(m, "Node %4d, zone %8s, type %12s ",
                                        pgdat->node_id,
                                        zone->name,
                                        migratetype_names[mtype]);
                for (order = 0; order < MAX_ORDER; ++order) {
                        unsigned long freecount = 0;
                        struct free_area *area;
                        struct list_head *curr;

                        area = &(zone->free_area[order]);

                        list_for_each(curr, &area->free_list[mtype])
                                freecount++;
                        seq_printf(m, "%6lu ", freecount);
                }
                seq_putc(m, '\n');
        }
}
/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
        int order;
        pg_data_t *pgdat = (pg_data_t *)arg;

        /* Print header */
        seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
        for (order = 0; order < MAX_ORDER; ++order)
                seq_printf(m, "%6d ", order);
        seq_putc(m, '\n');

        walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);

        return 0;
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
                                        pg_data_t *pgdat, struct zone *zone)
{
        int mtype;
        unsigned long pfn;
        unsigned long start_pfn = zone->zone_start_pfn;
        unsigned long end_pfn = start_pfn + zone->spanned_pages;
        unsigned long count[MIGRATE_TYPES] = { 0, };

        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                struct page *page;

                if (!pfn_valid(pfn))
                        continue;

                page = pfn_to_page(pfn);

                /* Watch for unexpected holes punched in the memmap */
                if (!memmap_valid_within(pfn, page, zone))
                        continue;

                mtype = get_pageblock_migratetype(page);

                if (mtype < MIGRATE_TYPES)
                        count[mtype]++;
        }

        /* Print counts */
        seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
        for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
                seq_printf(m, "%12lu ", count[mtype]);
        seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
        int mtype;
        pg_data_t *pgdat = (pg_data_t *)arg;

        seq_printf(m, "\n%-23s", "Number of blocks type ");
        for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
                seq_printf(m, "%12s ", migratetype_names[mtype]);
        seq_putc(m, '\n');
        walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);

        return 0;
}

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = (pg_data_t *)arg;

        /* check memoryless node */
        if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
                return 0;

        seq_printf(m, "Page block order: %d\n", pageblock_order);
        seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
        seq_putc(m, '\n');
        pagetypeinfo_showfree(m, pgdat);
        pagetypeinfo_showblockcount(m, pgdat);

        return 0;
}

static const struct seq_operations fragmentation_op = {
        .start  = frag_start,
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = frag_show,
};

static int fragmentation_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &fragmentation_op);
}

static const struct file_operations fragmentation_file_operations = {
        .open           = fragmentation_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

static const struct seq_operations pagetypeinfo_op = {
        .start  = frag_start,
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = pagetypeinfo_show,
};

static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &pagetypeinfo_op);
}

static const struct file_operations pagetypeinfo_file_ops = {
        .open           = pagetypeinfo_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
                                        TEXT_FOR_HIGHMEM(xx) xx "_movable",

static const char * const vmstat_text[] = {
        /* Zoned VM counters */
        "nr_free_pages",
        "nr_inactive_anon",
        "nr_active_anon",
        "nr_inactive_file",
        "nr_active_file",
        "nr_unevictable",
        "nr_mlock",
        "nr_anon_pages",
        "nr_mapped",
        "nr_file_pages",
        "nr_dirty",
        "nr_writeback",
        "nr_slab_reclaimable",
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_kernel_stack",
        "nr_unstable",
        "nr_bounce",
        "nr_vmscan_write",
        "nr_writeback_temp",
        "nr_isolated_anon",
        "nr_isolated_file",
        "nr_shmem",
#ifdef CONFIG_NUMA
        "numa_hit",
        "numa_miss",
        "numa_foreign",
        "numa_interleave",
        "numa_local",
        "numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
        "pgpgin",
        "pgpgout",
        "pswpin",
        "pswpout",

        TEXTS_FOR_ZONES("pgalloc")

        "pgfree",
        "pgactivate",
        "pgdeactivate",

        "pgfault",
        "pgmajfault",

        TEXTS_FOR_ZONES("pgrefill")
        TEXTS_FOR_ZONES("pgsteal")
        TEXTS_FOR_ZONES("pgscan_kswapd")
        TEXTS_FOR_ZONES("pgscan_direct")

#ifdef CONFIG_NUMA
        "zone_reclaim_failed",
#endif
        "pginodesteal",
        "slabs_scanned",
        "kswapd_steal",
        "kswapd_inodesteal",
        "kswapd_low_wmark_hit_quickly",
        "kswapd_high_wmark_hit_quickly",
        "kswapd_skip_congestion_wait",
        "pageoutrun",
        "allocstall",

        "pgrotated",
#ifdef CONFIG_HUGETLB_PAGE
        "htlb_buddy_alloc_success",
        "htlb_buddy_alloc_fail",
#endif
        "unevictable_pgs_culled",
        "unevictable_pgs_scanned",
        "unevictable_pgs_rescued",
        "unevictable_pgs_mlocked",
        "unevictable_pgs_munlocked",
        "unevictable_pgs_cleared",
        "unevictable_pgs_stranded",
        "unevictable_pgs_mlockfreed",
#endif
};

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                                                        struct zone *zone)
{
        int i;
        seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
        seq_printf(m,
                   "\n  pages free     %lu"
                   "\n        min      %lu"
                   "\n        low      %lu"
                   "\n        high     %lu"
                   "\n        scanned  %lu"
                   "\n        spanned  %lu"
                   "\n        present  %lu",
                   zone_page_state(zone, NR_FREE_PAGES),
                   min_wmark_pages(zone),
                   low_wmark_pages(zone),
                   high_wmark_pages(zone),
                   zone->pages_scanned,
                   zone->spanned_pages,
                   zone->present_pages);

        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
                                zone_page_state(zone, i));

        seq_printf(m,
                   "\n        protection: (%lu",
                   zone->lowmem_reserve[0]);
        for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
                seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
        seq_printf(m,
                   ")"
                   "\n  pagesets");
        for_each_online_cpu(i) {
                struct per_cpu_pageset *pageset;

                pageset = per_cpu_ptr(zone->pageset, i);
                seq_printf(m,
                           "\n    cpu: %i"
                           "\n              count: %i"
                           "\n              high:  %i"
                           "\n              batch: %i",
                           i,
                           pageset->pcp.count,
                           pageset->pcp.high,
                           pageset->pcp.batch);
#ifdef CONFIG_SMP
                seq_printf(m, "\n  vm stats threshold: %d",
                                pageset->stat_threshold);
#endif
        }
        seq_printf(m,
                   "\n  all_unreclaimable: %u"
                   "\n  prev_priority:     %i"
                   "\n  start_pfn:         %lu"
                   "\n  inactive_ratio:    %u",
                   zone->all_unreclaimable,
                   zone->prev_priority,
                   zone->zone_start_pfn,
                   zone->inactive_ratio);
        seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = (pg_data_t *)arg;
        walk_zones_in_node(m, pgdat, zoneinfo_show_print);
        return 0;
}

static const struct seq_operations zoneinfo_op = {
        .start  = frag_start, /* iterate over all zones. The same as in
                               * fragmentation. */
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = zoneinfo_show,
};

static int zoneinfo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &zoneinfo_op);
}

static const struct file_operations proc_zoneinfo_file_operations = {
        .open           = zoneinfo_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
        unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
        unsigned long *e;
#endif
        int i;

        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
        v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
                        + sizeof(struct vm_event_state), GFP_KERNEL);
#else
        v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
                        GFP_KERNEL);
#endif
        m->private = v;
        if (!v)
                return ERR_PTR(-ENOMEM);
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
        e = v + NR_VM_ZONE_STAT_ITEMS;
        all_vm_events(e);
        e[PGPGIN] /= 2;         /* sectors -> kbytes */
        e[PGPGOUT] /= 2;
#endif
        return v + *pos;
}
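
/*
 * Note: the buffer allocated above holds the NR_VM_ZONE_STAT_ITEMS zone
 * counters first, followed (with CONFIG_VM_EVENT_COUNTERS) by a struct
 * vm_event_state worth of event counters, in the same order as
 * vmstat_text[].  PGPGIN/PGPGOUT are halved because they are accounted
 * in 512-byte sectors but reported in kilobytes.
 */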

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
        (*pos)++;
        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;
        return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
        unsigned long *l = arg;
        unsigned long off = l - (unsigned long *)m->private;

        seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
        return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
        kfree(m->private);
        m->private = NULL;
}

static const struct seq_operations vmstat_op = {
        .start  = vmstat_start,
        .next   = vmstat_next,
        .stop   = vmstat_stop,
        .show   = vmstat_show,
};

static int vmstat_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &vmstat_op);
}

static const struct file_operations proc_vmstat_file_operations = {
        .open           = vmstat_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

static void vmstat_update(struct work_struct *w)
{
        refresh_cpu_vm_stats(smp_processor_id());
        schedule_delayed_work(&__get_cpu_var(vmstat_work),
                round_jiffies_relative(sysctl_stat_interval));
}

static void __cpuinit start_cpu_timer(int cpu)
{
        struct delayed_work *work = &per_cpu(vmstat_work, cpu);

        INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
        schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
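
/*
 * Note: the work item is deferrable, so an otherwise idle CPU is not
 * woken up just to refresh its statistics, and the initial expiry is
 * rounded with a per-cpu skew so that not all CPUs fire their vmstat
 * timers at the same instant.
 */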

/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
                unsigned long action,
                void *hcpu)
{
        long cpu = (long)hcpu;

        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                start_cpu_timer(cpu);
                node_set_state(cpu_to_node(cpu), N_CPU);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
                per_cpu(vmstat_work, cpu).work.func = NULL;
                break;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                start_cpu_timer(cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                refresh_zone_stat_thresholds();
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
        { &vmstat_cpuup_callback, NULL, 0 };
#endif

static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
        int cpu;

        refresh_zone_stat_thresholds();
        register_cpu_notifier(&vmstat_notifier);

        for_each_online_cpu(cpu)
                start_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
        proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
        proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
        proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
        proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
        return 0;
}
module_init(setup_vmstat)