x86: clean up the page table dumper and add 32-bit support
H. Peter Anvin [Thu, 17 Apr 2008 15:40:45 +0000 (17:40 +0200)]
Clean up the page table dumper (fix boundary conditions, use table-driven
address ranges, and adjust the formatting now that the output goes to a
separate virtual file rather than the kernel log), and generalize it to
32 bits.

[ mingo@elte.hu: x86: fix the pagetable dumper ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

arch/x86/Kconfig.debug
arch/x86/mm/Makefile
arch/x86/mm/dump_pagetables.c

index cb7002e..7ce8e70 100644 (file)
@@ -56,7 +56,7 @@ config DEBUG_PER_CPU_MAPS
 
 config X86_PTDUMP
        bool "Export kernel pagetable layout to userspace via debugfs"
-       depends on X86_64
+       depends on DEBUG_KERNEL
        select DEBUG_FS
        help
          Say Y here if you want to show the kernel pagetable layout in a
index 28632f4..9ab9889 100644 (file)
@@ -3,6 +3,7 @@ obj-y   :=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o
 obj-$(CONFIG_X86_32)           += pgtable_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)     += hugetlbpage.o
+obj-$(CONFIG_X86_PTDUMP)       += dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)          += highmem_32.o
 
@@ -12,5 +13,4 @@ else
 obj-$(CONFIG_NUMA)             += numa_64.o
 obj-$(CONFIG_K8_NUMA)          += k8topology_64.o
 obj-$(CONFIG_ACPI_NUMA)                += srat_64.o
-obj-$(CONFIG_X86_PTDUMP)       += dump_pagetables.o
 endif
index 5e7f643..6d84033 100644 (file)
  * of the License.
  */
 
+#include <linux/debugfs.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
-#include <linux/debugfs.h>
 
 #include <asm/pgtable.h>
 
@@ -28,73 +29,107 @@ struct pg_state {
        pgprot_t current_prot;
        unsigned long start_address;
        unsigned long current_address;
-       int printed_vmalloc;
-       int printed_modules;
-       int printed_vmemmap;
-       int printed_highmap;
+       const struct addr_marker *marker;
 };
 
-/* Multipliers for offsets within the PTEs */
-#define LEVEL_4_MULT (PAGE_SIZE)
-#define LEVEL_3_MULT (512UL * LEVEL_4_MULT)
-#define LEVEL_2_MULT (512UL * LEVEL_3_MULT)
-#define LEVEL_1_MULT (512UL * LEVEL_2_MULT)
+struct addr_marker {
+       unsigned long start_address;
+       const char *name;
+};
+
+/* Address space marker hints */
+static struct addr_marker address_markers[] = {
+       { 0, "User Space" },
+#ifdef CONFIG_X86_64
+       { 0x8000000000000000UL, "Kernel Space" },
+       { 0xffff810000000000UL, "Low Kernel Mapping" },
+       { VMALLOC_START,        "vmalloc() Area" },
+       { MODULES_VADDR,        "Modules" },
+       { MODULES_END,          "End Modules" },
+       { VMEMMAP_START,        "Vmemmap" },
+       { __START_KERNEL_map,   "High Kernel Mapping" },
+#else
+       { PAGE_OFFSET,          "Kernel Mapping" },
+       { 0/* VMALLOC_START */, "vmalloc() Area" },
+       { 0/*VMALLOC_END*/,     "vmalloc() End" },
+# ifdef CONFIG_HIGHMEM
+       { 0/*PKMAP_BASE*/,      "Persisent kmap() Area" },
+# endif
+       { 0/*FIXADDR_START*/,   "Fixmap Area" },
+#endif
+       { -1, NULL }            /* End of list */
+};
 
+/* Multipliers for offsets within the PTEs */
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
 
 /*
  * Print a readable form of a pgprot_t to the seq_file
  */
 static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
 {
-       unsigned long pr = pgprot_val(prot);
-
-       if (pr & _PAGE_USER)
-               seq_printf(m, "USR ");
-       else
-               seq_printf(m, "    ");
-       if (pr & _PAGE_RW)
-               seq_printf(m, "RW ");
-       else
-               seq_printf(m, "ro ");
-       if (pr & _PAGE_PWT)
-               seq_printf(m, "PWT ");
-       else
-               seq_printf(m, "    ");
-       if (pr & _PAGE_PCD)
-               seq_printf(m, "PCD ");
-       else
-               seq_printf(m, "    ");
-
-       /* Bit 9 has a different meaning on level 3 vs 4 */
-       if (level <= 3) {
-               if (pr & _PAGE_PSE)
-                       seq_printf(m, "PSE ");
+       pgprotval_t pr = pgprot_val(prot);
+       static const char * const level_name[] =
+               { "cr3", "pgd", "pud", "pmd", "pte" };
+
+       if (!pgprot_val(prot)) {
+               /* Not present */
+               seq_printf(m, "                          ");
+       } else {
+               if (pr & _PAGE_USER)
+                       seq_printf(m, "USR ");
                else
                        seq_printf(m, "    ");
-       } else {
-               if (pr & _PAGE_PAT)
-                       seq_printf(m, "pat ");
+               if (pr & _PAGE_RW)
+                       seq_printf(m, "RW ");
+               else
+                       seq_printf(m, "ro ");
+               if (pr & _PAGE_PWT)
+                       seq_printf(m, "PWT ");
+               else
+                       seq_printf(m, "    ");
+               if (pr & _PAGE_PCD)
+                       seq_printf(m, "PCD ");
                else
                        seq_printf(m, "    ");
+
+               /* Bit 7 has a different meaning on level 3 vs 4 */
+               if (level <= 3) {
+                       if (pr & _PAGE_PSE)
+                               seq_printf(m, "PSE ");
+                       else
+                               seq_printf(m, "    ");
+               } else {
+                       if (pr & _PAGE_PAT)
+                               seq_printf(m, "pat ");
+                       else
+                               seq_printf(m, "    ");
+               }
+               if (pr & _PAGE_GLOBAL)
+                       seq_printf(m, "GLB ");
+               else
+                       seq_printf(m, "    ");
+               if (pr & _PAGE_NX)
+                       seq_printf(m, "NX ");
+               else
+                       seq_printf(m, "x  ");
        }
-       if (pr & _PAGE_GLOBAL)
-               seq_printf(m, "GLB ");
-       else
-               seq_printf(m, "    ");
-       if (pr & _PAGE_NX)
-               seq_printf(m, "NX ");
-       else
-               seq_printf(m, "x  ");
+       seq_printf(m, "%s\n", level_name[level]);
 }
 
 /*
- * Sign-extend the 48 bit address to 64 bit
+ * On 64 bits, sign-extend the 48 bit address to 64 bit
  */
-static unsigned long sign_extend(unsigned long u)
+static unsigned long normalize_addr(unsigned long u)
 {
-       if (u>>47)
-               u = u | (0xffffUL << 48);
+#ifdef CONFIG_X86_64
+       return (signed long)(u << 16) >> 16;
+#else
        return u;
+#endif
 }
 
 /*
@@ -103,81 +138,62 @@ static unsigned long sign_extend(unsigned long u)
  * print what we collected so far.
  */
 static void note_page(struct seq_file *m, struct pg_state *st,
-                                       pgprot_t new_prot, int level)
+                     pgprot_t new_prot, int level)
 {
-       unsigned long prot, cur;
+       pgprotval_t prot, cur;
+       static const char units[] = "KMGTPE";
 
        /*
         * If we have a "break" in the series, we need to flush the state that
-        * we have now. "break" is either changing perms or a different level.
+        * we have now. "break" is either changing perms, levels or
+        * address space marker.
         */
        prot = pgprot_val(new_prot) & ~(PTE_MASK);
        cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
 
-       if ((prot != cur || level != st->level) &&
-                               st->current_address != st->start_address) {
-               char unit = 'K';
+       if (!st->level) {
+               /* First entry */
+               st->current_prot = new_prot;
+               st->level = level;
+               st->marker = address_markers;
+               seq_printf(m, "---[ %s ]---\n", st->marker->name);
+       } else if (prot != cur || level != st->level ||
+                  st->current_address >= st->marker[1].start_address) {
+               const char *unit = units;
                unsigned long delta;
 
                /*
-                * We print markers for special areas of address space,
-                * such as the start of vmalloc space etc.
-                * This helps in the interpretation.
-                */
-               if (!st->printed_vmalloc &&
-                               st->start_address >= VMALLOC_START) {
-                       seq_printf(m, "---[ VMALLOC SPACE ]---\n");
-                       st->printed_vmalloc = 1;
-               }
-               if (!st->printed_modules &&
-                               st->start_address >= MODULES_VADDR) {
-                       seq_printf(m, "---[ MODULES SPACE ]---\n");
-                       st->printed_modules = 1;
-               }
-               if (st->printed_modules < 2 &&
-                               st->start_address >= MODULES_END) {
-                       seq_printf(m, "---[ END MODULES SPACE ]---\n");
-                       st->printed_modules = 2;
-               }
-               if (!st->printed_vmemmap &&
-                               st->start_address >= VMEMMAP_START) {
-                       seq_printf(m, "---[ VMMEMMAP SPACE ]---\n");
-                       st->printed_vmemmap = 1;
-               }
-               if (!st->printed_highmap &&
-                               st->start_address >= __START_KERNEL_map) {
-                       seq_printf(m, "---[ HIGH KERNEL MAPPING ]---\n");
-                       st->printed_highmap = 1;
-               }
-
-               /*
                 * Now print the actual finished series
                 */
-               seq_printf(m, "[ %016lx -  %016lx   ",
-                               st->start_address, st->current_address);
+               seq_printf(m, "0x%p-0x%p   ",
+                          (void *)st->start_address,
+                          (void *)st->current_address);
 
                delta = (st->current_address - st->start_address) >> 10;
-               if ((delta & 1023) == 0) {
-                       delta = delta >> 10;
-                       unit = 'M';
+               while (!(delta & 1023) && unit[1]) {
+                       delta >>= 10;
+                       unit++;
                }
-               if (pgprot_val(st->current_prot)) {
-                       seq_printf(m, "Size %9lu%cb ", delta, unit);
-                       printk_prot(m, st->current_prot, st->level);
-                       seq_printf(m, "L%i]\n", st->level);
-               } else {
-                       /* don't print protections on non-present memory */
-                       seq_printf(m, "%14lu%cb", delta, unit);
-                       seq_printf(m, "                           L%i]\n",
-                                       st->level);
+               seq_printf(m, "%9lu%c ", delta, *unit);
+               printk_prot(m, st->current_prot, st->level);
+
+               /*
+                * We print markers for special areas of address space,
+                * such as the start of vmalloc space etc.
+                * This helps in the interpretation.
+                */
+               if (st->current_address >= st->marker[1].start_address) {
+                       st->marker++;
+                       seq_printf(m, "---[ %s ]---\n", st->marker->name);
                }
+
                st->start_address = st->current_address;
                st->current_prot = new_prot;
                st->level = level;
-       };
+       }
 }
 
-static void walk_level_4(struct seq_file *m, struct pg_state *st, pmd_t addr,
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
                                                        unsigned long P)
 {
        int i;
@@ -187,14 +203,15 @@ static void walk_level_4(struct seq_file *m, struct pg_state *st, pmd_t addr,
        for (i = 0; i < PTRS_PER_PTE; i++) {
                pgprot_t prot = pte_pgprot(*start);
 
-               st->current_address = sign_extend(P + i * LEVEL_4_MULT);
+               st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
                note_page(m, st, prot, 4);
                start++;
        }
 }
 
+#if PTRS_PER_PMD > 1
 
-static void walk_level_3(struct seq_file *m, struct pg_state *st, pud_t addr,
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
                                                        unsigned long P)
 {
        int i;
@@ -202,25 +219,30 @@ static void walk_level_3(struct seq_file *m, struct pg_state *st, pud_t addr,
 
        start = (pmd_t *) pud_page_vaddr(addr);
        for (i = 0; i < PTRS_PER_PMD; i++) {
-               st->current_address = sign_extend(P + i * LEVEL_3_MULT);
+               st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
                if (!pmd_none(*start)) {
-                       unsigned long prot;
+                       pgprotval_t prot = pmd_val(*start) & ~PTE_MASK;
 
-                       prot = pmd_val(*start) & ~(PTE_MASK);
-                       /* Deal with 2Mb pages */
-                       if (pmd_large(*start))
+                       if (pmd_large(*start) || !pmd_present(*start))
                                note_page(m, st, __pgprot(prot), 3);
                        else
-                               walk_level_4(m, st, *start,
-                                                       P + i * LEVEL_3_MULT);
+                               walk_pte_level(m, st, *start,
+                                              P + i * PMD_LEVEL_MULT);
                } else
                        note_page(m, st, __pgprot(0), 3);
                start++;
        }
 }
 
+#else
+#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
+#endif
 
-static void walk_level_2(struct seq_file *m, struct pg_state *st, pgd_t addr,
+#if PTRS_PER_PUD > 1
+
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
                                                        unsigned long P)
 {
        int i;
@@ -229,16 +251,15 @@ static void walk_level_2(struct seq_file *m, struct pg_state *st, pgd_t addr,
        start = (pud_t *) pgd_page_vaddr(addr);
 
        for (i = 0; i < PTRS_PER_PUD; i++) {
+               st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
                if (!pud_none(*start)) {
-                       unsigned long prot;
+                       pgprotval_t prot = pud_val(*start) & ~PTE_MASK;
 
-                       prot = pud_val(*start) & ~(PTE_MASK);
-                       /* Deal with 1Gb pages */
-                       if (pud_large(*start))
+                       if (pud_large(*start) || !pud_present(*start))
                                note_page(m, st, __pgprot(prot), 2);
                        else
-                               walk_level_3(m, st, *start,
-                                       P + i * LEVEL_2_MULT);
+                               walk_pmd_level(m, st, *start,
+                                              P + i * PUD_LEVEL_MULT);
                } else
                        note_page(m, st, __pgprot(0), 2);
 
@@ -246,28 +267,48 @@ static void walk_level_2(struct seq_file *m, struct pg_state *st, pgd_t addr,
        }
 }
 
-static void walk_level_1(struct seq_file *m)
+#else
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
+#define pgd_large(a) pud_large(__pud(pgd_val(a)))
+#define pgd_none(a)  pud_none(__pud(pgd_val(a)))
+#endif
+
+static void walk_pgd_level(struct seq_file *m)
 {
+#ifdef CONFIG_X86_64
        pgd_t *start = (pgd_t *) &init_level4_pgt;
+#else
+       pgd_t *start = swapper_pg_dir;
+#endif
        int i;
        struct pg_state st;
 
        memset(&st, 0, sizeof(st));
-       st.level = 1;
 
        for (i = 0; i < PTRS_PER_PGD; i++) {
-               if (!pgd_none(*start))
-                       walk_level_2(m, &st, *start, i * LEVEL_1_MULT);
-               else
+               st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
+               if (!pgd_none(*start)) {
+                       pgprotval_t prot = pgd_val(*start) & ~PTE_MASK;
+
+                       if (pgd_large(*start) || !pgd_present(*start))
+                               note_page(m, &st, __pgprot(prot), 1);
+                       else
+                               walk_pud_level(m, &st, *start,
+                                              i * PGD_LEVEL_MULT);
+               } else
                        note_page(m, &st, __pgprot(0), 1);
+
                start++;
        }
+
+       /* Flush out the last page */
+       st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
+       note_page(m, &st, __pgprot(0), 0);
 }
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
-       seq_puts(m, "Kernel pagetable dump\n");
-       walk_level_1(m);
+       walk_pgd_level(m);
        return 0;
 }
 
@@ -287,6 +328,18 @@ int pt_dump_init(void)
 {
        struct dentry *pe;
 
+#ifdef CONFIG_X86_32
+       /* Not a compile-time constant on x86-32 */
+       address_markers[2].start_address = VMALLOC_START;
+       address_markers[3].start_address = VMALLOC_END;
+# ifdef CONFIG_HIGHMEM
+       address_markers[4].start_address = PKMAP_BASE;
+       address_markers[5].start_address = FIXADDR_START;
+# else
+       address_markers[4].start_address = FIXADDR_START;
+# endif
+#endif
+
        pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
                                 &ptdump_fops);
        if (!pe)