[PATCH] Light weight event counters
[linux-2.6.git] / mm / vmstat.c
1 /*
2  *  linux/mm/vmstat.c
3  *
4  *  Manages VM statistics
5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6  *
7  *  zoned VM statistics
8  *  Copyright (C) 2006 Silicon Graphics, Inc.,
9  *              Christoph Lameter <christoph@lameter.com>
10  */
11
12 #include <linux/config.h>
13 #include <linux/mm.h>
14 #include <linux/module.h>
15
16 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
17                         unsigned long *free, struct pglist_data *pgdat)
18 {
19         struct zone *zones = pgdat->node_zones;
20         int i;
21
22         *active = 0;
23         *inactive = 0;
24         *free = 0;
25         for (i = 0; i < MAX_NR_ZONES; i++) {
26                 *active += zones[i].nr_active;
27                 *inactive += zones[i].nr_inactive;
28                 *free += zones[i].free_pages;
29         }
30 }
31
32 void get_zone_counts(unsigned long *active,
33                 unsigned long *inactive, unsigned long *free)
34 {
35         struct pglist_data *pgdat;
36
37         *active = 0;
38         *inactive = 0;
39         *free = 0;
40         for_each_online_pgdat(pgdat) {
41                 unsigned long l, m, n;
42                 __get_zone_counts(&l, &m, &n, pgdat);
43                 *active += l;
44                 *inactive += m;
45                 *free += n;
46         }
47 }
48
49 #ifdef CONFIG_VM_EVENT_COUNTERS
/*
 * Per-cpu instance of struct vm_event_state holding the VM event counters,
 * zero-initialised and exported through EXPORT_PER_CPU_SYMBOL().
 */
50 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
51 EXPORT_PER_CPU_SYMBOL(vm_event_states);
52
53 static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
54 {
55         int cpu = 0;
56         int i;
57
58         memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
59
60         cpu = first_cpu(*cpumask);
61         while (cpu < NR_CPUS) {
62                 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
63
64                 cpu = next_cpu(cpu, *cpumask);
65
66                 if (cpu < NR_CPUS)
67                         prefetch(&per_cpu(vm_event_states, cpu));
68
69
70                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
71                         ret[i] += this->event[i];
72         }
73 }
74
75 /*
76  * Accumulate the vm event counters across all CPUs.
77  * The result is unavoidably approximate - it can change
78  * during and after execution of this function.
79 */
80 void all_vm_events(unsigned long *ret)
81 {
82         sum_vm_events(ret, &cpu_online_map);
83 }
84
85 #ifdef CONFIG_HOTPLUG
86 /*
87  * Fold the foreign cpu events into our own.
88  *
89  * This is adding to the events on one processor
90  * but keeps the global counts constant.
91  */
92 void vm_events_fold_cpu(int cpu)
93 {
94         struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
95         int i;
96
97         for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
98                 count_vm_events(i, fold_state->event[i]);
99                 fold_state->event[i] = 0;
100         }
101 }
102 #endif /* CONFIG_HOTPLUG */
103
104 #endif /* CONFIG_VM_EVENT_COUNTERS */
105
106 /*
107  * Manage combined zone based / global counters
108  *
109  * vm_stat contains the global counters
110  */
/* One atomic_long_t per zone statistics item (NR_VM_ZONE_STAT_ITEMS). */
111 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
112 EXPORT_SYMBOL(vm_stat);
113
114 #ifdef CONFIG_SMP
115
/*
 * Bound on the magnitude of a per-cpu differential: the update paths in
 * this file hand the differential to zone_page_state_add() and reset it to
 * zero once it goes beyond +/- this value.
 */
116 #define STAT_THRESHOLD 32
117
118 /*
119  * Determine pointer to currently valid differential byte given a zone and
120  * the item number.
121  *
122  * Preemption must be off
123  */
124 static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
125 {
126         return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
127 }
128
129 /*
130  * For use when we know that interrupts are disabled.
131  */
132 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
133                                 int delta)
134 {
135         s8 *p;
136         long x;
137
138         p = diff_pointer(zone, item);
139         x = delta + *p;
140
141         if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
142                 zone_page_state_add(x, zone, item);
143                 x = 0;
144         }
145
146         *p = x;
147 }
148 EXPORT_SYMBOL(__mod_zone_page_state);
149
150 /*
151  * For an unknown interrupt state
152  */
153 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
154                                         int delta)
155 {
156         unsigned long flags;
157
158         local_irq_save(flags);
159         __mod_zone_page_state(zone, item, delta);
160         local_irq_restore(flags);
161 }
162 EXPORT_SYMBOL(mod_zone_page_state);
163
164 /*
165  * Optimized increment and decrement functions.
166  *
167  * These are only for a single page and therefore can take a struct page *
168  * argument instead of struct zone *. This allows the inclusion of the code
169  * generated for page_zone(page) into the optimized functions.
170  *
171  * No overflow check is necessary and therefore the differential can be
172  * incremented or decremented in place which may allow the compilers to
173  * generate better code.
174  *
175  * The increment or decrement is known and therefore one boundary check can
176  * be omitted.
177  *
178  * Some processors have inc/dec instructions that are atomic vs an interrupt.
179  * However, the code must first determine the differential location in a zone
180  * based on the processor number and then inc/dec the counter. There is no
181  * guarantee without disabling preemption that the processor will not change
182  * in between and therefore the atomicity vs. interrupt cannot be exploited
183  * in a useful way here.
184  */
185 static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
186 {
187         s8 *p = diff_pointer(zone, item);
188
189         (*p)++;
190
191         if (unlikely(*p > STAT_THRESHOLD)) {
192                 zone_page_state_add(*p, zone, item);
193                 *p = 0;
194         }
195 }
196
197 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
198 {
199         __inc_zone_state(page_zone(page), item);
200 }
201 EXPORT_SYMBOL(__inc_zone_page_state);
202
203 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
204 {
205         struct zone *zone = page_zone(page);
206         s8 *p = diff_pointer(zone, item);
207
208         (*p)--;
209
210         if (unlikely(*p < -STAT_THRESHOLD)) {
211                 zone_page_state_add(*p, zone, item);
212                 *p = 0;
213         }
214 }
215 EXPORT_SYMBOL(__dec_zone_page_state);
216
217 void inc_zone_state(struct zone *zone, enum zone_stat_item item)
218 {
219         unsigned long flags;
220
221         local_irq_save(flags);
222         __inc_zone_state(zone, item);
223         local_irq_restore(flags);
224 }
225
226 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
227 {
228         unsigned long flags;
229         struct zone *zone;
230
231         zone = page_zone(page);
232         local_irq_save(flags);
233         __inc_zone_state(zone, item);
234         local_irq_restore(flags);
235 }
236 EXPORT_SYMBOL(inc_zone_page_state);
237
238 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
239 {
240         unsigned long flags;
241         struct zone *zone;
242         s8 *p;
243
244         zone = page_zone(page);
245         local_irq_save(flags);
246         p = diff_pointer(zone, item);
247
248         (*p)--;
249
250         if (unlikely(*p < -STAT_THRESHOLD)) {
251                 zone_page_state_add(*p, zone, item);
252                 *p = 0;
253         }
254         local_irq_restore(flags);
255 }
256 EXPORT_SYMBOL(dec_zone_page_state);
257
258 /*
259  * Update the zone counters for one cpu.
260  */
261 void refresh_cpu_vm_stats(int cpu)
262 {
263         struct zone *zone;
264         int i;
265         unsigned long flags;
266
267         for_each_zone(zone) {
268                 struct per_cpu_pageset *pcp;
269
270                 pcp = zone_pcp(zone, cpu);
271
272                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
273                         if (pcp->vm_stat_diff[i]) {
274                                 local_irq_save(flags);
275                                 zone_page_state_add(pcp->vm_stat_diff[i],
276                                         zone, i);
277                                 pcp->vm_stat_diff[i] = 0;
278                                 local_irq_restore(flags);
279                         }
280         }
281 }
282
/*
 * on_each_cpu() callback: refresh the counters of the processor this is
 * running on.  @dummy is unused.
 */
static void __refresh_cpu_vm_stats(void *dummy)
{
	int this_cpu = smp_processor_id();

	refresh_cpu_vm_stats(this_cpu);
}
287
288 /*
289  * Consolidate all counters.
290  *
291  * Note that the result is less inaccurate but still inaccurate
292  * if concurrent processes are allowed to run.
293  */
294 void refresh_vm_stats(void)
295 {
296         on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
297 }
298 EXPORT_SYMBOL(refresh_vm_stats);
299
300 #endif
301
302 #ifdef CONFIG_NUMA
303 /*
304  * zonelist = the list of zones passed to the allocator
305  * z        = the zone from which the allocation occurred.
306  *
307  * Must be called with interrupts disabled.
308  */
309 void zone_statistics(struct zonelist *zonelist, struct zone *z)
310 {
311         if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
312                 __inc_zone_state(z, NUMA_HIT);
313         } else {
314                 __inc_zone_state(z, NUMA_MISS);
315                 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
316         }
317         if (z->zone_pgdat == NODE_DATA(numa_node_id()))
318                 __inc_zone_state(z, NUMA_LOCAL);
319         else
320                 __inc_zone_state(z, NUMA_OTHER);
321 }
322 #endif
323
324 #ifdef CONFIG_PROC_FS
325
326 #include <linux/seq_file.h>
327
328 static void *frag_start(struct seq_file *m, loff_t *pos)
329 {
330         pg_data_t *pgdat;
331         loff_t node = *pos;
332         for (pgdat = first_online_pgdat();
333              pgdat && node;
334              pgdat = next_online_pgdat(pgdat))
335                 --node;
336
337         return pgdat;
338 }
339
340 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
341 {
342         pg_data_t *pgdat = (pg_data_t *)arg;
343
344         (*pos)++;
345         return next_online_pgdat(pgdat);
346 }
347
/*
 * seq_file stop callback.  Intentionally empty: frag_start() and
 * frag_next() acquire nothing that would have to be released here.
 */
static void frag_stop(struct seq_file *m, void *arg)
{
}
351
352 /*
353  * This walks the free areas for each zone.
354  */
355 static int frag_show(struct seq_file *m, void *arg)
356 {
357         pg_data_t *pgdat = (pg_data_t *)arg;
358         struct zone *zone;
359         struct zone *node_zones = pgdat->node_zones;
360         unsigned long flags;
361         int order;
362
363         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
364                 if (!populated_zone(zone))
365                         continue;
366
367                 spin_lock_irqsave(&zone->lock, flags);
368                 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
369                 for (order = 0; order < MAX_ORDER; ++order)
370                         seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
371                 spin_unlock_irqrestore(&zone->lock, flags);
372                 seq_putc(m, '\n');
373         }
374         return 0;
375 }
376
/*
 * seq_file callbacks that pair the online-node walker (frag_start,
 * frag_next, frag_stop) with frag_show as the per-node output routine.
 */
377 struct seq_operations fragmentation_op = {
378         .start  = frag_start,
379         .next   = frag_next,
380         .stop   = frag_stop,
381         .show   = frag_show,
382 };
383
/*
 * Names printed for the VM statistics.
 *
 * zoneinfo_show() indexes this table with a zone statistics item number,
 * and vmstat_show() indexes it with an offset into the buffer built by
 * vmstat_start(), which holds the NR_VM_ZONE_STAT_ITEMS zone counters
 * followed by the event counters.  The entries must therefore stay in the
 * same order as those values (presumably the order of the corresponding
 * enums - confirm against their definitions when adding an entry).
 *
 * The table is read-only, so both the array and the strings are const.
 */
static const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_slab",
	"nr_page_table_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_bounce",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	"pgalloc_dma",
	"pgalloc_dma32",
	"pgalloc_normal",
	"pgalloc_high",

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	"pgrefill_dma",
	"pgrefill_dma32",
	"pgrefill_normal",
	"pgrefill_high",

	"pgsteal_dma",
	"pgsteal_dma32",
	"pgsteal_normal",
	"pgsteal_high",

	"pgscan_kswapd_dma",
	"pgscan_kswapd_dma32",
	"pgscan_kswapd_normal",
	"pgscan_kswapd_high",

	"pgscan_direct_dma",
	"pgscan_direct_dma32",
	"pgscan_direct_normal",
	"pgscan_direct_high",

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
#endif
};
453
454 /*
455  * Output information about zones in @pgdat.
456  */
457 static int zoneinfo_show(struct seq_file *m, void *arg)
458 {
459         pg_data_t *pgdat = arg;
460         struct zone *zone;
461         struct zone *node_zones = pgdat->node_zones;
462         unsigned long flags;
463
464         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
465                 int i;
466
467                 if (!populated_zone(zone))
468                         continue;
469
470                 spin_lock_irqsave(&zone->lock, flags);
471                 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
472                 seq_printf(m,
473                            "\n  pages free     %lu"
474                            "\n        min      %lu"
475                            "\n        low      %lu"
476                            "\n        high     %lu"
477                            "\n        active   %lu"
478                            "\n        inactive %lu"
479                            "\n        scanned  %lu (a: %lu i: %lu)"
480                            "\n        spanned  %lu"
481                            "\n        present  %lu",
482                            zone->free_pages,
483                            zone->pages_min,
484                            zone->pages_low,
485                            zone->pages_high,
486                            zone->nr_active,
487                            zone->nr_inactive,
488                            zone->pages_scanned,
489                            zone->nr_scan_active, zone->nr_scan_inactive,
490                            zone->spanned_pages,
491                            zone->present_pages);
492
493                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
494                         seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
495                                         zone_page_state(zone, i));
496
497                 seq_printf(m,
498                            "\n        protection: (%lu",
499                            zone->lowmem_reserve[0]);
500                 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
501                         seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
502                 seq_printf(m,
503                            ")"
504                            "\n  pagesets");
505                 for_each_online_cpu(i) {
506                         struct per_cpu_pageset *pageset;
507                         int j;
508
509                         pageset = zone_pcp(zone, i);
510                         for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
511                                 if (pageset->pcp[j].count)
512                                         break;
513                         }
514                         if (j == ARRAY_SIZE(pageset->pcp))
515                                 continue;
516                         for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
517                                 seq_printf(m,
518                                            "\n    cpu: %i pcp: %i"
519                                            "\n              count: %i"
520                                            "\n              high:  %i"
521                                            "\n              batch: %i",
522                                            i, j,
523                                            pageset->pcp[j].count,
524                                            pageset->pcp[j].high,
525                                            pageset->pcp[j].batch);
526                         }
527                 }
528                 seq_printf(m,
529                            "\n  all_unreclaimable: %u"
530                            "\n  prev_priority:     %i"
531                            "\n  temp_priority:     %i"
532                            "\n  start_pfn:         %lu",
533                            zone->all_unreclaimable,
534                            zone->prev_priority,
535                            zone->temp_priority,
536                            zone->zone_start_pfn);
537                 spin_unlock_irqrestore(&zone->lock, flags);
538                 seq_putc(m, '\n');
539         }
540         return 0;
541 }
542
/*
 * seq_file callbacks for the zone information output: the start/next/stop
 * callbacks are shared with fragmentation_op (they step through the online
 * nodes) and zoneinfo_show prints the zones of each node.
 */
543 struct seq_operations zoneinfo_op = {
544         .start  = frag_start, /* iterate over all zones. The same as in
545                                * fragmentation. */
546         .next   = frag_next,
547         .stop   = frag_stop,
548         .show   = zoneinfo_show,
549 };
550
551 static void *vmstat_start(struct seq_file *m, loff_t *pos)
552 {
553         unsigned long *v;
554 #ifdef CONFIG_VM_EVENT_COUNTERS
555         unsigned long *e;
556 #endif
557         int i;
558
559         if (*pos >= ARRAY_SIZE(vmstat_text))
560                 return NULL;
561
562 #ifdef CONFIG_VM_EVENT_COUNTERS
563         v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
564                         + sizeof(struct vm_event_state), GFP_KERNEL);
565 #else
566         v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
567                         GFP_KERNEL);
568 #endif
569         m->private = v;
570         if (!v)
571                 return ERR_PTR(-ENOMEM);
572         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
573                 v[i] = global_page_state(i);
574 #ifdef CONFIG_VM_EVENT_COUNTERS
575         e = v + NR_VM_ZONE_STAT_ITEMS;
576         all_vm_events(e);
577         e[PGPGIN] /= 2;         /* sectors -> kbytes */
578         e[PGPGOUT] /= 2;
579 #endif
580         return v + *pos;
581 }
582
583 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
584 {
585         (*pos)++;
586         if (*pos >= ARRAY_SIZE(vmstat_text))
587                 return NULL;
588         return (unsigned long *)m->private + *pos;
589 }
590
591 static int vmstat_show(struct seq_file *m, void *arg)
592 {
593         unsigned long *l = arg;
594         unsigned long off = l - (unsigned long *)m->private;
595
596         seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
597         return 0;
598 }
599
600 static void vmstat_stop(struct seq_file *m, void *arg)
601 {
602         kfree(m->private);
603         m->private = NULL;
604 }
605
/*
 * seq_file callbacks for the vmstat output: vmstat_start allocates and
 * fills the snapshot, vmstat_next steps through it, vmstat_show prints one
 * entry and vmstat_stop frees the snapshot.
 */
606 struct seq_operations vmstat_op = {
607         .start  = vmstat_start,
608         .next   = vmstat_next,
609         .stop   = vmstat_stop,
610         .show   = vmstat_show,
611 };
612
613 #endif /* CONFIG_PROC_FS */
614