arch/i386/xen/mmu.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
  35  * when it is not actively in use.  This means that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/sched.h>
  42 #include <linux/highmem.h>
  43 #include <linux/bug.h>
  44 #include <linux/sched.h>
  45
  46 #include <asm/pgtable.h>
  47 #include <asm/tlbflush.h>
  48 #include <asm/mmu_context.h>
  49 #include <asm/paravirt.h>
  50
  51 #include <asm/xen/hypercall.h>
  52 #include <asm/xen/hypervisor.h>
  53
  54 #include <xen/page.h>
  55 #include <xen/interface/xen.h>
  56
  57 #include "multicalls.h"
  58 #include "mmu.h"
  59
  60 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
  61 {
  62         pte_t *pte = lookup_address(address);
  63         unsigned offset = address & PAGE_MASK;
  64
  65         BUG_ON(pte == NULL);
  66
  67         return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
  68 }
  69
  70 void make_lowmem_page_readonly(void *vaddr)
  71 {
  72         pte_t *pte, ptev;
  73         unsigned long address = (unsigned long)vaddr;
  74
  75         pte = lookup_address(address);
  76         BUG_ON(pte == NULL);
  77
  78         ptev = pte_wrprotect(*pte);
  79
  80         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
  81                 BUG();
  82 }
  83
  84 void make_lowmem_page_readwrite(void *vaddr)
  85 {
  86         pte_t *pte, ptev;
  87         unsigned long address = (unsigned long)vaddr;
  88
  89         pte = lookup_address(address);
  90         BUG_ON(pte == NULL);
  91
  92         ptev = pte_mkwrite(*pte);
  93
  94         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
  95                 BUG();
  96 }
  97
  98
  99 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 100 {
 101         struct mmu_update u;
 102
 103         u.ptr = virt_to_machine(ptr).maddr;
 104         u.val = pmd_val_ma(val);
 105         if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
 106                 BUG();
 107 }
 108
 109 /*
 110  * Associate a virtual page frame with a given physical page frame
 111  * and protection flags for that frame.
 112  */
 113 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 114 {
 115         pgd_t *pgd;
 116         pud_t *pud;
 117         pmd_t *pmd;
 118         pte_t *pte;
 119
 120         pgd = swapper_pg_dir + pgd_index(vaddr);
 121         if (pgd_none(*pgd)) {
 122                 BUG();
 123                 return;
 124         }
 125         pud = pud_offset(pgd, vaddr);
 126         if (pud_none(*pud)) {
 127                 BUG();
 128                 return;
 129         }
 130         pmd = pmd_offset(pud, vaddr);
 131         if (pmd_none(*pmd)) {
 132                 BUG();
 133                 return;
 134         }
 135         pte = pte_offset_kernel(pmd, vaddr);
 136         /* <mfn,flags> stored as-is, to permit clearing entries */
 137         xen_set_pte(pte, mfn_pte(mfn, flags));
 138
 139         /*
 140          * It's enough to flush this one mapping.
 141          * (PGE mappings get flushed as well)
 142          */
 143         __flush_tlb_one(vaddr);
 144 }
 145
 146 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 147                     pte_t *ptep, pte_t pteval)
 148 {
 149         if ((mm != current->mm && mm != &init_mm) ||
 150             HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0)
 151                 xen_set_pte(ptep, pteval);
 152 }
 153
 154 #ifdef CONFIG_X86_PAE
 155 void xen_set_pud(pud_t *ptr, pud_t val)
 156 {
 157         struct mmu_update u;
 158
 159         u.ptr = virt_to_machine(ptr).maddr;
 160         u.val = pud_val_ma(val);
 161         if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
 162                 BUG();
 163 }
 164
 165 void xen_set_pte(pte_t *ptep, pte_t pte)
 166 {
 167         ptep->pte_high = pte.pte_high;
 168         smp_wmb();
 169         ptep->pte_low = pte.pte_low;
 170 }
 171
 172 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 173 {
 174         set_64bit((u64 *)ptep, pte_val_ma(pte));
 175 }
 176
 177 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 178 {
 179         ptep->pte_low = 0;
 180         smp_wmb();              /* make sure low gets written first */
 181         ptep->pte_high = 0;
 182 }
 183
 184 void xen_pmd_clear(pmd_t *pmdp)
 185 {
 186         xen_set_pmd(pmdp, __pmd(0));
 187 }
 188
 189 unsigned long long xen_pte_val(pte_t pte)
 190 {
 191         unsigned long long ret = 0;
 192
 193         if (pte.pte_low) {
 194                 ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
 195                 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
 196         }
 197
 198         return ret;
 199 }
 200
 201 unsigned long long xen_pmd_val(pmd_t pmd)
 202 {
 203         unsigned long long ret = pmd.pmd;
 204         if (ret)
 205                 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
 206         return ret;
 207 }
 208
 209 unsigned long long xen_pgd_val(pgd_t pgd)
 210 {
 211         unsigned long long ret = pgd.pgd;
 212         if (ret)
 213                 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
 214         return ret;
 215 }
 216
 217 pte_t xen_make_pte(unsigned long long pte)
 218 {
 219         if (pte & 1)
 220                 pte = phys_to_machine(XPADDR(pte)).maddr;
 221
 222         return (pte_t){ pte, pte >> 32 };
 223 }
 224
 225 pmd_t xen_make_pmd(unsigned long long pmd)
 226 {
 227         if (pmd & 1)
 228                 pmd = phys_to_machine(XPADDR(pmd)).maddr;
 229
 230         return (pmd_t){ pmd };
 231 }
 232
 233 pgd_t xen_make_pgd(unsigned long long pgd)
 234 {
 235         if (pgd & _PAGE_PRESENT)
 236                 pgd = phys_to_machine(XPADDR(pgd)).maddr;
 237
 238         return (pgd_t){ pgd };
 239 }
 240 #else  /* !PAE */
 241 void xen_set_pte(pte_t *ptep, pte_t pte)
 242 {
 243         *ptep = pte;
 244 }
 245
 246 unsigned long xen_pte_val(pte_t pte)
 247 {
 248         unsigned long ret = pte.pte_low;
 249
 250         if (ret & _PAGE_PRESENT)
 251                 ret = machine_to_phys(XMADDR(ret)).paddr;
 252
 253         return ret;
 254 }
 255
 256 unsigned long xen_pgd_val(pgd_t pgd)
 257 {
 258         unsigned long ret = pgd.pgd;
 259         if (ret)
 260                 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
 261         return ret;
 262 }
 263
 264 pte_t xen_make_pte(unsigned long pte)
 265 {
 266         if (pte & _PAGE_PRESENT)
 267                 pte = phys_to_machine(XPADDR(pte)).maddr;
 268
 269         return (pte_t){ pte };
 270 }
 271
 272 pgd_t xen_make_pgd(unsigned long pgd)
 273 {
 274         if (pgd & _PAGE_PRESENT)
 275                 pgd = phys_to_machine(XPADDR(pgd)).maddr;
 276
 277         return (pgd_t){ pgd };
 278 }
 279 #endif  /* CONFIG_X86_PAE */
 280
 281
 282
 283 /*
 284   (Yet another) pagetable walker.  This one is intended for pinning a
 285   pagetable.  This means that it walks a pagetable and calls the
 286   callback function on each page it finds making up the page table,
 287   at every level.  It walks the entire pagetable, but it only bothers
 288   pinning pte pages which are below pte_limit.  In the normal case
 289   this will be TASK_SIZE, but at boot we need to pin up to
 290   FIXADDR_TOP.  But the important bit is that we don't pin beyond
 291   there, because then we start getting into Xen's ptes.
 292 */
 293 static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 294                     unsigned long limit)
 295 {
 296         pgd_t *pgd = pgd_base;
 297         int flush = 0;
 298         unsigned long addr = 0;
 299         unsigned long pgd_next;
 300
 301         BUG_ON(limit > FIXADDR_TOP);
 302
 303         if (xen_feature(XENFEAT_auto_translated_physmap))
 304                 return 0;
 305
 306         for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
 307                 pud_t *pud;
 308                 unsigned long pud_limit, pud_next;
 309
 310                 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
 311
 312                 if (!pgd_val(*pgd))
 313                         continue;
 314
 315                 pud = pud_offset(pgd, 0);
 316
 317                 if (PTRS_PER_PUD > 1) /* not folded */
 318                         flush |= (*func)(virt_to_page(pud), 0);
 319
 320                 for (; addr != pud_limit; pud++, addr = pud_next) {
 321                         pmd_t *pmd;
 322                         unsigned long pmd_limit;
 323
 324                         pud_next = pud_addr_end(addr, pud_limit);
 325
 326                         if (pud_next < limit)
 327                                 pmd_limit = pud_next;
 328                         else
 329                                 pmd_limit = limit;
 330
 331                         if (pud_none(*pud))
 332                                 continue;
 333
 334                         pmd = pmd_offset(pud, 0);
 335
 336                         if (PTRS_PER_PMD > 1) /* not folded */
 337                                 flush |= (*func)(virt_to_page(pmd), 0);
 338
 339                         for (; addr != pmd_limit; pmd++) {
 340                                 addr += (PAGE_SIZE * PTRS_PER_PTE);
 341                                 if ((pmd_limit-1) < (addr-1)) {
 342                                         addr = pmd_limit;
 343                                         break;
 344                                 }
 345
 346                                 if (pmd_none(*pmd))
 347                                         continue;
 348
 349                                 flush |= (*func)(pmd_page(*pmd), 0);
 350                         }
 351                 }
 352         }
 353
 354         flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
 355
 356         return flush;
 357 }
 358
 359 static int pin_page(struct page *page, unsigned flags)
 360 {
 361         unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 362         int flush;
 363
 364         if (pgfl)
 365                 flush = 0;              /* already pinned */
 366         else if (PageHighMem(page))
 367                 /* kmaps need flushing if we found an unpinned
 368                    highpage */
 369                 flush = 1;
 370         else {
 371                 void *pt = lowmem_page_address(page);
 372                 unsigned long pfn = page_to_pfn(page);
 373                 struct multicall_space mcs = __xen_mc_entry(0);
 374
 375                 flush = 0;
 376
 377                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 378                                         pfn_pte(pfn, PAGE_KERNEL_RO),
 379                                         flags);
 380         }
 381
 382         return flush;
 383 }
 384
 385 /* This is called just after a mm has been created, but it has not
 386    been used yet.  We need to make sure that its pagetable is all
 387    read-only, and can be pinned. */
 388 void xen_pgd_pin(pgd_t *pgd)
 389 {
 390         struct multicall_space mcs;
 391         struct mmuext_op *op;
 392
 393         xen_mc_batch();
 394
 395         if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
 396                 /* re-enable interrupts for kmap_flush_unused */
 397                 xen_mc_issue(0);
 398                 kmap_flush_unused();
 399                 xen_mc_batch();
 400         }
 401
 402         mcs = __xen_mc_entry(sizeof(*op));
 403         op = mcs.args;
 404
 405 #ifdef CONFIG_X86_PAE
 406         op->cmd = MMUEXT_PIN_L3_TABLE;
 407 #else
 408         op->cmd = MMUEXT_PIN_L2_TABLE;
 409 #endif
 410         op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
 411         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 412
 413         xen_mc_issue(0);
 414 }
 415
 416 /* The init_mm pagetable is really pinned as soon as its created, but
 417    that's before we have page structures to store the bits.  So do all
 418    the book-keeping now. */
 419 static __init int mark_pinned(struct page *page, unsigned flags)
 420 {
 421         SetPagePinned(page);
 422         return 0;
 423 }
 424
 425 void __init xen_mark_init_mm_pinned(void)
 426 {
 427         pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 428 }
 429
 430 static int unpin_page(struct page *page, unsigned flags)
 431 {
 432         unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 433
 434         if (pgfl && !PageHighMem(page)) {
 435                 void *pt = lowmem_page_address(page);
 436                 unsigned long pfn = page_to_pfn(page);
 437                 struct multicall_space mcs = __xen_mc_entry(0);
 438
 439                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 440                                         pfn_pte(pfn, PAGE_KERNEL),
 441                                         flags);
 442         }
 443
 444         return 0;               /* never need to flush on unpin */
 445 }
 446
 447 /* Release a pagetables pages back as normal RW */
 448 static void xen_pgd_unpin(pgd_t *pgd)
 449 {
 450         struct mmuext_op *op;
 451         struct multicall_space mcs;
 452
 453         xen_mc_batch();
 454
 455         mcs = __xen_mc_entry(sizeof(*op));
 456
 457         op = mcs.args;
 458         op->cmd = MMUEXT_UNPIN_TABLE;
 459         op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
 460
 461         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 462
 463         pgd_walk(pgd, unpin_page, TASK_SIZE);
 464
 465         xen_mc_issue(0);
 466 }
 467
 468 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 469 {
 470         spin_lock(&next->page_table_lock);
 471         xen_pgd_pin(next->pgd);
 472         spin_unlock(&next->page_table_lock);
 473 }
 474
 475 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 476 {
 477         spin_lock(&mm->page_table_lock);
 478         xen_pgd_pin(mm->pgd);
 479         spin_unlock(&mm->page_table_lock);
 480 }
 481
 482
 483 #ifdef CONFIG_SMP
 484 /* Another cpu may still have their %cr3 pointing at the pagetable, so
 485    we need to repoint it somewhere else before we can unpin it. */
 486 static void drop_other_mm_ref(void *info)
 487 {
 488         struct mm_struct *mm = info;
 489
 490         if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
 491                 leave_mm(smp_processor_id());
 492 }
 493
 494 static void drop_mm_ref(struct mm_struct *mm)
 495 {
 496         if (current->active_mm == mm) {
 497                 if (current->mm == mm)
 498                         load_cr3(swapper_pg_dir);
 499                 else
 500                         leave_mm(smp_processor_id());
 501         }
 502
 503         if (!cpus_empty(mm->cpu_vm_mask))
 504                 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
 505                                            mm, 1);
 506 }
 507 #else
 508 static void drop_mm_ref(struct mm_struct *mm)
 509 {
 510         if (current->active_mm == mm)
 511                 load_cr3(swapper_pg_dir);
 512 }
 513 #endif
 514
 515 /*
 516  * While a process runs, Xen pins its pagetables, which means that the
 517  * hypervisor forces it to be read-only, and it controls all updates
 518  * to it.  This means that all pagetable updates have to go via the
 519  * hypervisor, which is moderately expensive.
 520  *
 521  * Since we're pulling the pagetable down, we switch to use init_mm,
 522  * unpin old process pagetable and mark it all read-write, which
 523  * allows further operations on it to be simple memory accesses.
 524  *
 525  * The only subtle point is that another CPU may be still using the
 526  * pagetable because of lazy tlb flushing.  This means we need need to
 527  * switch all CPUs off this pagetable before we can unpin it.
 528  */
 529 void xen_exit_mmap(struct mm_struct *mm)
 530 {
 531         get_cpu();              /* make sure we don't move around */
 532         drop_mm_ref(mm);
 533         put_cpu();
 534
 535         spin_lock(&mm->page_table_lock);
 536         xen_pgd_unpin(mm->pgd);
 537         spin_unlock(&mm->page_table_lock);
 538 }