/*
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/features.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"
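
/*
 * Illustrative sketch (hypothetical helper, not used elsewhere in this
 * file): the pfn <-> mfn round trip described in the header comment.
 * The p2m table maps a guest "physical" frame to the machine frame
 * that backs it, and the m2p table maps it back.
 */
static inline unsigned long example_pfn_roundtrip(unsigned long pfn)
{
        unsigned long mfn = pfn_to_mfn(pfn);    /* pseudo-physical -> machine */

        return mfn_to_pfn(mfn);                 /* machine -> pseudo-physical */
}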
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
        pte_t *pte = lookup_address(address);
        unsigned offset = address & ~PAGE_MASK; /* offset within the page */

        BUG_ON(pte == NULL);

        return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
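
/*
 * Sketch (hypothetical helper, example only): the inverse of the
 * arithmetic above.  A machine address splits into an mfn
 * (maddr >> PAGE_SHIFT) and an in-page offset; PAGE_MASK covers the
 * frame bits, so the offset is the ~PAGE_MASK part.
 */
static inline unsigned long example_maddr_offset(xmaddr_t maddr)
{
        return maddr.maddr & ~PAGE_MASK;        /* offset within the frame */
}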
void make_lowmem_page_readonly(void *vaddr)
{
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;

        pte = lookup_address(address);
        BUG_ON(pte == NULL);

        ptev = pte_wrprotect(*pte);

        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
                BUG();
}
void make_lowmem_page_readwrite(void *vaddr)
{
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;

        pte = lookup_address(address);
        BUG_ON(pte == NULL);

        ptev = pte_mkwrite(*pte);

        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
                BUG();
}
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
        struct mmu_update u;

        u.ptr = virt_to_machine(ptr).maddr;
        u.val = pmd_val_ma(val);
        if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
                BUG();
}
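
/*
 * Sketch (hypothetical helper, not used in this file): because
 * HYPERVISOR_mmu_update() takes an array and a count, several
 * pagetable words can be updated with one hypercall.  A batched
 * version of the update above would look like this.
 */
static inline void example_set_two_pmds(pmd_t *a, pmd_t va,
                                        pmd_t *b, pmd_t vb)
{
        struct mmu_update u[2];

        u[0].ptr = virt_to_machine(a).maddr;    /* machine address of slot */
        u[0].val = pmd_val_ma(va);              /* new value, mfn-based */
        u[1].ptr = virt_to_machine(b).maddr;
        u[1].val = pmd_val_ma(vb);

        if (HYPERVISOR_mmu_update(u, 2, NULL, DOMID_SELF) < 0)
                BUG();
}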
/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        /* <mfn,flags> stored as-is, to permit clearing entries */
        xen_set_pte(pte, mfn_pte(mfn, flags));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval)
{
        if ((mm != current->mm && mm != &init_mm) ||
            HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0)
                xen_set_pte(ptep, pteval);
}
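
/*
 * Note on xen_set_pte_at() above: HYPERVISOR_update_va_mapping() can
 * only modify the currently loaded address space (or the kernel's),
 * which is why the mm is checked first; for any other mm, or if the
 * hypercall fails, we fall back to writing the pte entry itself.
 */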
#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
        struct mmu_update u;

        u.ptr = virt_to_machine(ptr).maddr;
        u.val = pud_val_ma(val);
        if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
                BUG();
}
void xen_set_pte(pte_t *ptep, pte_t pte)
{
        ptep->pte_high = pte.pte_high;
        smp_wmb();              /* make sure high gets written first */
        ptep->pte_low = pte.pte_low;
}
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        set_64bit((u64 *)ptep, pte_val_ma(pte));
}
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        ptep->pte_low = 0;
        smp_wmb();              /* make sure low gets written first */
        ptep->pte_high = 0;
}
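
/*
 * The two halves of a live PAE pte must be written in an order that
 * never leaves a present entry with a stale half: xen_set_pte() stores
 * pte_high before pte_low, and xen_pte_clear() zeroes pte_low before
 * pte_high, because pte_low carries the _PAGE_PRESENT bit.
 */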
void xen_pmd_clear(pmd_t *pmdp)
{
        xen_set_pmd(pmdp, __pmd(0));
}
unsigned long long xen_pte_val(pte_t pte)
{
        unsigned long long ret = 0;

        if (pte.pte_low) {
                ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        }

        return ret;
}
unsigned long long xen_pmd_val(pmd_t pmd)
{
        unsigned long long ret = pmd.pmd;
        if (ret)
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        return ret;
}
unsigned long long xen_pgd_val(pgd_t pgd)
{
        unsigned long long ret = pgd.pgd;
        if (ret)
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        return ret;
}
pte_t xen_make_pte(unsigned long long pte)
{
        if (pte & _PAGE_PRESENT)
                pte = phys_to_machine(XPADDR(pte)).maddr;

        return (pte_t){ pte, pte >> 32 };
}
pmd_t xen_make_pmd(unsigned long long pmd)
{
        if (pmd & _PAGE_PRESENT)
                pmd = phys_to_machine(XPADDR(pmd)).maddr;

        return (pmd_t){ pmd };
}
pgd_t xen_make_pgd(unsigned long long pgd)
{
        if (pgd & _PAGE_PRESENT)
                pgd = phys_to_machine(XPADDR(pgd)).maddr;

        return (pgd_t){ pgd };
}
#else  /* !CONFIG_X86_PAE */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
        *ptep = pte;
}
unsigned long xen_pte_val(pte_t pte)
{
        unsigned long ret = pte.pte_low;

        if (ret & _PAGE_PRESENT)
                ret = machine_to_phys(XMADDR(ret)).paddr;

        return ret;
}
unsigned long xen_pgd_val(pgd_t pgd)
{
        unsigned long ret = pgd.pgd;
        if (ret)
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        return ret;
}
pte_t xen_make_pte(unsigned long pte)
{
        if (pte & _PAGE_PRESENT)
                pte = phys_to_machine(XPADDR(pte)).maddr;

        return (pte_t){ pte };
}
pgd_t xen_make_pgd(unsigned long pgd)
{
        if (pgd & _PAGE_PRESENT)
                pgd = phys_to_machine(XPADDR(pgd)).maddr;

        return (pgd_t){ pgd };
}
#endif  /* CONFIG_X86_PAE */
/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be TASK_SIZE, but at boot we need to pin up to FIXADDR_TOP.
 * But the important bit is that we don't pin beyond there, because
 * then we start getting into Xen's ptes.
 */
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
                    unsigned long limit)
{
        pgd_t *pgd = pgd_base;
        int flush = 0;
        unsigned long addr = 0;
        unsigned long pgd_next;

        BUG_ON(limit > FIXADDR_TOP);

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
                pud_t *pud;
                unsigned long pud_limit, pud_next;

                pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

                if (!pgd_val(*pgd))
                        continue;

                pud = pud_offset(pgd, 0);

                if (PTRS_PER_PUD > 1) /* not folded */
                        flush |= (*func)(virt_to_page(pud), 0);

                for (; addr != pud_limit; pud++, addr = pud_next) {
                        pmd_t *pmd;
                        unsigned long pmd_limit;

                        pud_next = pud_addr_end(addr, pud_limit);

                        if (pud_next < limit)
                                pmd_limit = pud_next;
                        else
                                pmd_limit = limit;

                        if (pud_none(*pud))
                                continue;

                        pmd = pmd_offset(pud, 0);

                        if (PTRS_PER_PMD > 1) /* not folded */
                                flush |= (*func)(virt_to_page(pmd), 0);

                        for (; addr != pmd_limit; pmd++) {
                                addr += (PAGE_SIZE * PTRS_PER_PTE);
                                /* -1 on both sides so the compare survives
                                   addr wrapping to 0 at the very top of
                                   the address space */
                                if ((pmd_limit-1) < (addr-1)) {
                                        addr = pmd_limit;
                                        break;
                                }

                                if (pmd_none(*pmd))
                                        continue;

                                flush |= (*func)(pmd_page(*pmd), 0);
                        }
                }
        }

        flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);

        return flush;
}
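
/*
 * Sketch (hypothetical callback, not used in this file): pgd_walk()
 * just applies a callback to every page making up the pagetable; the
 * smallest possible walker inspects each page and reports that no TLB
 * flush is needed.  mark_pinned() below is a real minimal instance.
 */
static inline int example_noop_walker(struct page *page, unsigned flags)
{
        return 0;       /* nothing changed, so no flush required */
}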
static int pin_page(struct page *page, unsigned flags)
{
        unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
        int flush;

        if (pgfl)
                flush = 0;              /* already pinned */
        else if (PageHighMem(page))
                /* kmaps need flushing if we found an unpinned highpage */
                flush = 1;
        else {
                void *pt = lowmem_page_address(page);
                unsigned long pfn = page_to_pfn(page);
                struct multicall_space mcs = __xen_mc_entry(0);

                flush = 0;

                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                        pfn_pte(pfn, PAGE_KERNEL_RO),
                                        flags);
        }

        return flush;
}
/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
        struct multicall_space mcs;
        struct mmuext_op *op;

        xen_mc_batch();

        if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
                /* re-enable interrupts for kmap_flush_unused */
                xen_mc_issue(0);
                kmap_flush_unused();
                xen_mc_batch();
        }

        mcs = __xen_mc_entry(sizeof(*op));
        op = mcs.args;

#ifdef CONFIG_X86_PAE
        op->cmd = MMUEXT_PIN_L3_TABLE;
#else
        op->cmd = MMUEXT_PIN_L2_TABLE;
#endif
        op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

        xen_mc_issue(0);
}
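
/*
 * Sketch (hypothetical helper, not used in this file): the multicall
 * pattern used by xen_pgd_pin()/xen_pgd_unpin().  Everything queued
 * between xen_mc_batch() and xen_mc_issue() goes to Xen as a single
 * batch, e.g. remapping one lowmem pagetable page read-only:
 */
static inline void example_remap_pt_ro(struct page *page)
{
        void *pt = lowmem_page_address(page);
        unsigned long pfn = page_to_pfn(page);
        struct multicall_space mcs;

        xen_mc_batch();
        mcs = __xen_mc_entry(0);
        MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                pfn_pte(pfn, PAGE_KERNEL_RO),
                                UVMF_INVLPG);
        xen_mc_issue(0);
}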
/* The init_mm pagetable is really pinned as soon as it's created, but
   that's before we have page structures to store the bits.  So do all
   the book-keeping now. */
static __init int mark_pinned(struct page *page, unsigned flags)
{
        set_bit(PG_pinned, &page->flags);
        return 0;
}

void __init xen_mark_init_mm_pinned(void)
{
        pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}
static int unpin_page(struct page *page, unsigned flags)
{
        unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);

        if (pgfl && !PageHighMem(page)) {
                void *pt = lowmem_page_address(page);
                unsigned long pfn = page_to_pfn(page);
                struct multicall_space mcs = __xen_mc_entry(0);

                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                        pfn_pte(pfn, PAGE_KERNEL),
                                        flags);
        }

        return 0;               /* never need to flush on unpin */
}
/* Release a pagetable's pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
        struct mmuext_op *op;
        struct multicall_space mcs;

        xen_mc_batch();

        mcs = __xen_mc_entry(sizeof(*op));

        op = mcs.args;
        op->cmd = MMUEXT_UNPIN_TABLE;
        op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));

        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

        pgd_walk(pgd, unpin_page, TASK_SIZE);

        xen_mc_issue(0);
}
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
        spin_lock(&next->page_table_lock);
        xen_pgd_pin(next->pgd);
        spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        spin_lock(&mm->page_table_lock);
        xen_pgd_pin(mm->pgd);
        spin_unlock(&mm->page_table_lock);
}
#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
        struct mm_struct *mm = info;

        if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
                leave_mm(smp_processor_id());
}

static void drop_mm_ref(struct mm_struct *mm)
{
        if (current->active_mm == mm) {
                if (current->mm == mm)
                        load_cr3(swapper_pg_dir);
                else
                        leave_mm(smp_processor_id());
        }

        if (!cpus_empty(mm->cpu_vm_mask))
                xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
                                           mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
{
        if (current->active_mm == mm)
                load_cr3(swapper_pg_dir);
}
#endif
/*
 * While a process runs, Xen pins its pagetable, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process's pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
        get_cpu();              /* make sure we don't move around */
        drop_mm_ref(mm);
        put_cpu();

        spin_lock(&mm->page_table_lock);
        xen_pgd_unpin(mm->pgd);
        spin_unlock(&mm->page_table_lock);
}