[PATCH] zoned vm counters: conversion of nr_unstable to per zone counter
[linux-2.6.git] / mm / vmstat.c
1 /*
2  *  linux/mm/vmstat.c
3  *
4  *  Manages VM statistics
5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6  *
7  *  zoned VM statistics
8  *  Copyright (C) 2006 Silicon Graphics, Inc.,
9  *              Christoph Lameter <christoph@lameter.com>
10  */
11
12 #include <linux/config.h>
13 #include <linux/mm.h>
14 #include <linux/module.h>
15
/*
 * Per-cpu legacy page_state counters.  Readers accumulate these
 * across all CPUs; the result is unavoidably approximate - it can
 * change during and after execution of the summing functions.
 */
DEFINE_PER_CPU(struct page_state, page_states) = {0};
22
/*
 * Sum the first @nr unsigned-long fields of the per-cpu page_state
 * into *ret for every cpu in *cpumask.
 *
 * @ret:     output; treated as a flat array of @nr unsigned longs
 * @nr:      number of unsigned long fields to accumulate
 * @cpumask: cpus to include; narrowed to online cpus (modified in place)
 *
 * The result is only approximate: counters may change while summing.
 */
static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
{
	unsigned cpu;

	memset(ret, 0, nr * sizeof(unsigned long));
	cpus_and(*cpumask, *cpumask, cpu_online_map);

	for_each_cpu_mask(cpu, *cpumask) {
		unsigned long *in;
		unsigned long *out;
		unsigned off;
		unsigned next_cpu;

		/* struct page_state is accessed as an array of unsigned long */
		in = (unsigned long *)&per_cpu(page_states, cpu);

		/* prefetch the next cpu's data while summing this one's */
		next_cpu = next_cpu(cpu, *cpumask);
		if (likely(next_cpu < NR_CPUS))
			prefetch(&per_cpu(page_states, next_cpu));

		out = (unsigned long *)ret;
		for (off = 0; off < nr; off++)
			*out++ += *in++;
	}
}
47
48 void get_full_page_state(struct page_state *ret)
49 {
50         cpumask_t mask = CPU_MASK_ALL;
51
52         __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
53 }
54
55 unsigned long read_page_state_offset(unsigned long offset)
56 {
57         unsigned long ret = 0;
58         int cpu;
59
60         for_each_online_cpu(cpu) {
61                 unsigned long in;
62
63                 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
64                 ret += *((unsigned long *)in);
65         }
66         return ret;
67 }
68
69 void __mod_page_state_offset(unsigned long offset, unsigned long delta)
70 {
71         void *ptr;
72
73         ptr = &__get_cpu_var(page_states);
74         *(unsigned long *)(ptr + offset) += delta;
75 }
76 EXPORT_SYMBOL(__mod_page_state_offset);
77
/*
 * Add @delta to this cpu's page_state field at byte @offset, safe
 * for any interrupt state.
 *
 * Delegates to __mod_page_state_offset() with interrupts disabled
 * instead of duplicating its per-cpu pointer arithmetic, so the two
 * variants cannot drift apart.
 */
void mod_page_state_offset(unsigned long offset, unsigned long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_page_state_offset(offset, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_page_state_offset);
89
90 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
91                         unsigned long *free, struct pglist_data *pgdat)
92 {
93         struct zone *zones = pgdat->node_zones;
94         int i;
95
96         *active = 0;
97         *inactive = 0;
98         *free = 0;
99         for (i = 0; i < MAX_NR_ZONES; i++) {
100                 *active += zones[i].nr_active;
101                 *inactive += zones[i].nr_inactive;
102                 *free += zones[i].free_pages;
103         }
104 }
105
106 void get_zone_counts(unsigned long *active,
107                 unsigned long *inactive, unsigned long *free)
108 {
109         struct pglist_data *pgdat;
110
111         *active = 0;
112         *inactive = 0;
113         *free = 0;
114         for_each_online_pgdat(pgdat) {
115                 unsigned long l, m, n;
116                 __get_zone_counts(&l, &m, &n, pgdat);
117                 *active += l;
118                 *inactive += m;
119                 *free += n;
120         }
121 }
122
/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters: each zone keeps its own
 * counters plus per-cpu differentials, and overflows are folded
 * into these global atomics.
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);
130
131 #ifdef CONFIG_SMP
132
133 #define STAT_THRESHOLD 32
134
135 /*
136  * Determine pointer to currently valid differential byte given a zone and
137  * the item number.
138  *
139  * Preemption must be off
140  */
141 static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
142 {
143         return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
144 }
145
146 /*
147  * For use when we know that interrupts are disabled.
148  */
149 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150                                 int delta)
151 {
152         s8 *p;
153         long x;
154
155         p = diff_pointer(zone, item);
156         x = delta + *p;
157
158         if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
159                 zone_page_state_add(x, zone, item);
160                 x = 0;
161         }
162
163         *p = x;
164 }
165 EXPORT_SYMBOL(__mod_zone_page_state);
166
/*
 * Apply @delta to a zone counter for an unknown interrupt state:
 * disable interrupts around the per-cpu differential update.
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);
180
181 /*
182  * Optimized increment and decrement functions.
183  *
184  * These are only for a single page and therefore can take a struct page *
185  * argument instead of struct zone *. This allows the inclusion of the code
186  * generated for page_zone(page) into the optimized functions.
187  *
188  * No overflow check is necessary and therefore the differential can be
189  * incremented or decremented in place which may allow the compilers to
190  * generate better code.
191  *
192  * The increment or decrement is known and therefore one boundary check can
193  * be omitted.
194  *
195  * Some processors have inc/dec instructions that are atomic vs an interrupt.
196  * However, the code must first determine the differential location in a zone
197  * based on the processor number and then inc/dec the counter. There is no
198  * guarantee without disabling preemption that the processor will not change
199  * in between and therefore the atomicity vs. interrupt cannot be exploited
200  * in a useful way here.
201  */
202 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
203 {
204         struct zone *zone = page_zone(page);
205         s8 *p = diff_pointer(zone, item);
206
207         (*p)++;
208
209         if (unlikely(*p > STAT_THRESHOLD)) {
210                 zone_page_state_add(*p, zone, item);
211                 *p = 0;
212         }
213 }
214 EXPORT_SYMBOL(__inc_zone_page_state);
215
216 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
217 {
218         struct zone *zone = page_zone(page);
219         s8 *p = diff_pointer(zone, item);
220
221         (*p)--;
222
223         if (unlikely(*p < -STAT_THRESHOLD)) {
224                 zone_page_state_add(*p, zone, item);
225                 *p = 0;
226         }
227 }
228 EXPORT_SYMBOL(__dec_zone_page_state);
229
230 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
231 {
232         unsigned long flags;
233         struct zone *zone;
234         s8 *p;
235
236         zone = page_zone(page);
237         local_irq_save(flags);
238         p = diff_pointer(zone, item);
239
240         (*p)++;
241
242         if (unlikely(*p > STAT_THRESHOLD)) {
243                 zone_page_state_add(*p, zone, item);
244                 *p = 0;
245         }
246         local_irq_restore(flags);
247 }
248 EXPORT_SYMBOL(inc_zone_page_state);
249
250 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
251 {
252         unsigned long flags;
253         struct zone *zone;
254         s8 *p;
255
256         zone = page_zone(page);
257         local_irq_save(flags);
258         p = diff_pointer(zone, item);
259
260         (*p)--;
261
262         if (unlikely(*p < -STAT_THRESHOLD)) {
263                 zone_page_state_add(*p, zone, item);
264                 *p = 0;
265         }
266         local_irq_restore(flags);
267 }
268 EXPORT_SYMBOL(dec_zone_page_state);
269
270 /*
271  * Update the zone counters for one cpu.
272  */
273 void refresh_cpu_vm_stats(int cpu)
274 {
275         struct zone *zone;
276         int i;
277         unsigned long flags;
278
279         for_each_zone(zone) {
280                 struct per_cpu_pageset *pcp;
281
282                 pcp = zone_pcp(zone, cpu);
283
284                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
285                         if (pcp->vm_stat_diff[i]) {
286                                 local_irq_save(flags);
287                                 zone_page_state_add(pcp->vm_stat_diff[i],
288                                         zone, i);
289                                 pcp->vm_stat_diff[i] = 0;
290                                 local_irq_restore(flags);
291                         }
292         }
293 }
294
/* on_each_cpu() callback: fold this cpu's differentials */
static void __refresh_cpu_vm_stats(void *dummy)
{
	refresh_cpu_vm_stats(smp_processor_id());
}
299
/*
 * Consolidate all counters by folding the differentials on every
 * cpu (last argument 1 = wait for all cpus to finish).
 *
 * Note that the result is less inaccurate but still inaccurate
 * if concurrent processes are allowed to run.
 */
void refresh_vm_stats(void)
{
	on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
}
EXPORT_SYMBOL(refresh_vm_stats);
311
312 #endif
313
314 #ifdef CONFIG_PROC_FS
315
316 #include <linux/seq_file.h>
317
318 static void *frag_start(struct seq_file *m, loff_t *pos)
319 {
320         pg_data_t *pgdat;
321         loff_t node = *pos;
322         for (pgdat = first_online_pgdat();
323              pgdat && node;
324              pgdat = next_online_pgdat(pgdat))
325                 --node;
326
327         return pgdat;
328 }
329
330 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
331 {
332         pg_data_t *pgdat = (pg_data_t *)arg;
333
334         (*pos)++;
335         return next_online_pgdat(pgdat);
336 }
337
/* seq_file stop: nothing to release */
static void frag_stop(struct seq_file *m, void *arg)
{
}
341
342 /*
343  * This walks the free areas for each zone.
344  */
345 static int frag_show(struct seq_file *m, void *arg)
346 {
347         pg_data_t *pgdat = (pg_data_t *)arg;
348         struct zone *zone;
349         struct zone *node_zones = pgdat->node_zones;
350         unsigned long flags;
351         int order;
352
353         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
354                 if (!populated_zone(zone))
355                         continue;
356
357                 spin_lock_irqsave(&zone->lock, flags);
358                 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
359                 for (order = 0; order < MAX_ORDER; ++order)
360                         seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
361                 spin_unlock_irqrestore(&zone->lock, flags);
362                 seq_putc(m, '\n');
363         }
364         return 0;
365 }
366
/* seq_file operations for /proc/buddyinfo */
struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};
373
/*
 * Counter names for /proc/vmstat and /proc/zoneinfo.  The first
 * NR_VM_ZONE_STAT_ITEMS entries must stay in the same order as
 * enum zone_stat_item; the remaining entries follow the layout of
 * struct page_state (see vmstat_start()).
 */
static char *vmstat_text[] = {
	/* Zoned VM counters */
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_slab",
	"nr_page_table_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",

	/* Event counters */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	"pgalloc_high",
	"pgalloc_normal",
	"pgalloc_dma32",
	"pgalloc_dma",

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma32",
	"pgrefill_dma",

	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma32",
	"pgsteal_dma",

	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",
	"pgscan_kswapd_dma32",
	"pgscan_kswapd_dma",

	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma32",
	"pgscan_direct_dma",

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
	"nr_bounce",
};
433
/*
 * Output information about zones in @pgdat: watermarks, LRU sizes,
 * per-zone VM counters, lowmem reserves, per-cpu pageset state and
 * reclaim bookkeeping.  All of it is printed under zone->lock.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
		int i;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
		/* basic zone state and watermarks */
		seq_printf(m,
			   "\n  pages free     %lu"
			   "\n        min      %lu"
			   "\n        low      %lu"
			   "\n        high     %lu"
			   "\n        active   %lu"
			   "\n        inactive %lu"
			   "\n        scanned  %lu (a: %lu i: %lu)"
			   "\n        spanned  %lu"
			   "\n        present  %lu",
			   zone->free_pages,
			   zone->pages_min,
			   zone->pages_low,
			   zone->pages_high,
			   zone->nr_active,
			   zone->nr_inactive,
			   zone->pages_scanned,
			   zone->nr_scan_active, zone->nr_scan_inactive,
			   zone->spanned_pages,
			   zone->present_pages);

		/* the per-zone counters, named by vmstat_text */
		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
					zone_page_state(zone, i));

		/* lowmem_reserve protection for each lower zone */
		seq_printf(m,
			   "\n        protection: (%lu",
			   zone->lowmem_reserve[0]);
		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
		seq_printf(m,
			   ")"
			   "\n  pagesets");
		for_each_online_cpu(i) {
			struct per_cpu_pageset *pageset;
			int j;

			pageset = zone_pcp(zone, i);
			/* skip cpus whose pcp lists are all empty */
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				if (pageset->pcp[j].count)
					break;
			}
			if (j == ARRAY_SIZE(pageset->pcp))
				continue;
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				seq_printf(m,
					   "\n    cpu: %i pcp: %i"
					   "\n              count: %i"
					   "\n              high:  %i"
					   "\n              batch: %i",
					   i, j,
					   pageset->pcp[j].count,
					   pageset->pcp[j].high,
					   pageset->pcp[j].batch);
			}
#ifdef CONFIG_NUMA
			seq_printf(m,
				   "\n            numa_hit:       %lu"
				   "\n            numa_miss:      %lu"
				   "\n            numa_foreign:   %lu"
				   "\n            interleave_hit: %lu"
				   "\n            local_node:     %lu"
				   "\n            other_node:     %lu",
				   pageset->numa_hit,
				   pageset->numa_miss,
				   pageset->numa_foreign,
				   pageset->interleave_hit,
				   pageset->local_node,
				   pageset->other_node);
#endif
		}
		/* reclaim state of the zone */
		seq_printf(m,
			   "\n  all_unreclaimable: %u"
			   "\n  prev_priority:     %i"
			   "\n  temp_priority:     %i"
			   "\n  start_pfn:         %lu",
			   zone->all_unreclaimable,
			   zone->prev_priority,
			   zone->temp_priority,
			   zone->zone_start_pfn);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}
537
/* seq_file operations for /proc/zoneinfo */
struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};
545
546 static void *vmstat_start(struct seq_file *m, loff_t *pos)
547 {
548         unsigned long *v;
549         struct page_state *ps;
550         int i;
551
552         if (*pos >= ARRAY_SIZE(vmstat_text))
553                 return NULL;
554
555         v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
556                         + sizeof(*ps), GFP_KERNEL);
557         m->private = v;
558         if (!v)
559                 return ERR_PTR(-ENOMEM);
560         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
561                 v[i] = global_page_state(i);
562         ps = (struct page_state *)(v + NR_VM_ZONE_STAT_ITEMS);
563         get_full_page_state(ps);
564         ps->pgpgin /= 2;                /* sectors -> kbytes */
565         ps->pgpgout /= 2;
566         return v + *pos;
567 }
568
569 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
570 {
571         (*pos)++;
572         if (*pos >= ARRAY_SIZE(vmstat_text))
573                 return NULL;
574         return (unsigned long *)m->private + *pos;
575 }
576
577 static int vmstat_show(struct seq_file *m, void *arg)
578 {
579         unsigned long *l = arg;
580         unsigned long off = l - (unsigned long *)m->private;
581
582         seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
583         return 0;
584 }
585
/* seq_file stop: free the snapshot built by vmstat_start() */
static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}
591
/* seq_file operations for /proc/vmstat */
struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
598
599 #endif /* CONFIG_PROC_FS */
600