[S390] kvm guest address space mapping
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 37a23c2..2adb239 100644
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/quicklist.h>
 #include <linux/rcupdate.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -133,30 +134,374 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
 }
 #endif
 
-static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
+#ifdef CONFIG_PGSTE
+
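For orientation: the gmap code below relies on a few small structures and the new mm->context.gmap_list head introduced by the companion header change (arch/s390/include/asm/pgtable.h), which is not part of this hunk. The sketch below is reconstructed from how this file uses the fields, not copied from the header, so treat names and field order as approximate.

	struct gmap {				/* one guest address space */
		struct list_head list;		/* link in mm->context.gmap_list */
		struct mm_struct *mm;		/* parent address space */
		unsigned long *table;		/* region-1 table (top level) */
		struct list_head crst_list;	/* pages backing region/segment tables */
	};

	struct gmap_rmap {			/* reverse map: page table -> gmap entry */
		struct list_head list;		/* link in gmap_pgtable->mapper */
		unsigned long *entry;		/* gmap segment table entry */
	};

	struct gmap_pgtable {			/* per pgste page-table page */
		unsigned long vmaddr;		/* parent address backed by this table */
		struct list_head mapper;	/* gmap_rmap list for this table */
	};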
+/**
+ * gmap_alloc - allocate a guest address space
+ * @mm: pointer to the parent mm_struct
+ *
+ * Returns a guest address space structure or NULL if out of memory.
+ */
+struct gmap *gmap_alloc(struct mm_struct *mm)
 {
-       unsigned int old, new;
+       struct gmap *gmap;
+       struct page *page;
+       unsigned long *table;
 
-       do {
-               old = atomic_read(v);
-               new = old ^ bits;
-       } while (atomic_cmpxchg(v, old, new) != old);
-       return new;
+       gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
+       if (!gmap)
+               goto out;
+       INIT_LIST_HEAD(&gmap->crst_list);
+       gmap->mm = mm;
+       page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
+       if (!page)
+               goto out_free;
+       list_add(&page->lru, &gmap->crst_list);
+       table = (unsigned long *) page_to_phys(page);
+       crst_table_init(table, _REGION1_ENTRY_EMPTY);
+       gmap->table = table;
+       list_add(&gmap->list, &mm->context.gmap_list);
+       return gmap;
+
+out_free:
+       kfree(gmap);
+out:
+       return NULL;
 }
+EXPORT_SYMBOL_GPL(gmap_alloc);
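gmap_alloc pairs with gmap_free further down; a minimal, hedged sketch of the expected lifecycle from a hypothetical caller (the example_* names are not part of this patch):

	/* Illustrative only: create and destroy a guest address space for the
	 * current process.  gmap_alloc returns NULL on allocation failure. */
	static struct gmap *example_create_guest_space(void)
	{
		return gmap_alloc(current->mm);
	}

	static void example_destroy_guest_space(struct gmap *gmap)
	{
		if (gmap)
			gmap_free(gmap);	/* flushes the TLB, frees CRST pages */
	}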
 
-/*
- * page table entry allocation/free routines.
+static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
+{
+       struct gmap_pgtable *mp;
+       struct gmap_rmap *rmap;
+       struct page *page;
+
+       if (*table & _SEGMENT_ENTRY_INV)
+               return 0;
+       page = pfn_to_page(*table >> PAGE_SHIFT);
+       mp = (struct gmap_pgtable *) page->index;
+       list_for_each_entry(rmap, &mp->mapper, list) {
+               if (rmap->entry != table)
+                       continue;
+               list_del(&rmap->list);
+               kfree(rmap);
+               break;
+       }
+       *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
+       return 1;
+}
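gmap_unlink_segment reverts a connected entry to the "pending" encoding that gmap_map_segment stores, so the guest address stays logically mapped but must be re-connected by gmap_fault. Purely as a reading aid, the three encodings a gmap segment table entry cycles through, with illustrative helpers that are not part of the patch:

	/* unmapped:  _SEGMENT_ENTRY_INV
	 * pending:   _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | parent vmaddr
	 * connected: origin of the parent page table, invalid bit clear */
	static inline int example_gmap_entry_connected(unsigned long entry)
	{
		return !(entry & _SEGMENT_ENTRY_INV);
	}

	static inline int example_gmap_entry_pending(unsigned long entry)
	{
		return (entry & _SEGMENT_ENTRY_INV) && (entry & _SEGMENT_ENTRY_RO);
	}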
+
+static void gmap_flush_tlb(struct gmap *gmap)
+{
+       if (MACHINE_HAS_IDTE)
+               __tlb_flush_idte((unsigned long) gmap->table |
+                                _ASCE_TYPE_REGION1);
+       else
+               __tlb_flush_global();
+}
+
+/**
+ * gmap_free - free a guest address space
+ * @gmap: pointer to the guest address space structure
  */
-#ifdef CONFIG_PGSTE
-static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
+void gmap_free(struct gmap *gmap)
+{
+       struct page *page, *next;
+       unsigned long *table;
+       int i;
+
+       /* Flush tlb. */
+       gmap_flush_tlb(gmap);
+
+       /* Free all segment & region tables. */
+       down_read(&gmap->mm->mmap_sem);
+       list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
+               table = (unsigned long *) page_to_phys(page);
+               if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
+                       /* Remove gmap rmap structures for segment table. */
+                       for (i = 0; i < PTRS_PER_PMD; i++, table++)
+                               gmap_unlink_segment(gmap, table);
+               __free_pages(page, ALLOC_ORDER);
+       }
+       up_read(&gmap->mm->mmap_sem);
+       list_del(&gmap->list);
+       kfree(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_enable - switch primary space to the guest address space
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_enable(struct gmap *gmap)
+{
+       /* Load primary space page table origin. */
+       S390_lowcore.user_asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
+                                _ASCE_USER_BITS | __pa(gmap->table);
+       asm volatile("lctlg 1,1,%0\n" : : "m" (S390_lowcore.user_asce) );
+       S390_lowcore.gmap = (unsigned long) gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_enable);
+
+/**
+ * gmap_disable - switch back to the standard primary address space
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_disable(struct gmap *gmap)
+{
+       /* Load primary space page table origin. */
+       S390_lowcore.user_asce =
+               gmap->mm->context.asce_bits | __pa(gmap->mm->pgd);
+       asm volatile("lctlg 1,1,%0\n" : : "m" (S390_lowcore.user_asce) );
+       S390_lowcore.gmap = 0UL;
+}
+EXPORT_SYMBOL_GPL(gmap_disable);
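gmap_enable and gmap_disable only swap the primary ASCE in control register 1 and record the active gmap in the lowcore; the caller is expected to bracket guest execution with them. A hedged sketch of that pattern (the SIE entry itself is outside the scope of this file):

	/* Illustrative only: run guest code under the gmap ASCE and switch
	 * back afterwards.  Preemption handling around the real SIE entry is
	 * more involved and not shown. */
	static void example_run_guest(struct gmap *gmap)
	{
		gmap_enable(gmap);	/* CR1 now points at gmap->table */
		/* ... enter SIE and execute the guest ... */
		gmap_disable(gmap);	/* back to the normal user ASCE */
	}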
+
+static int gmap_alloc_table(struct gmap *gmap,
+                              unsigned long *table, unsigned long init)
+{
+       struct page *page;
+       unsigned long *new;
+
+       page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
+       if (!page)
+               return -ENOMEM;
+       new = (unsigned long *) page_to_phys(page);
+       crst_table_init(new, init);
+       down_read(&gmap->mm->mmap_sem);
+       if (*table & _REGION_ENTRY_INV) {
+               list_add(&page->lru, &gmap->crst_list);
+               *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
+                       (*table & _REGION_ENTRY_TYPE_MASK);
+       } else
+               __free_pages(page, ALLOC_ORDER);
+       up_read(&gmap->mm->mmap_sem);
+       return 0;
+}
+
+/**
+ * gmap_unmap_segment - unmap segment from the guest address space
+ * @gmap: pointer to the guest address space structure
+ * @to: address in the guest address space
+ * @len: length of the memory area to unmap
+ *
+ * Returns 0 if the unmap succeeded, -EINVAL if not.
+ */
+int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
+{
+       unsigned long *table;
+       unsigned long off;
+       int flush;
+
+       if ((to | len) & (PMD_SIZE - 1))
+               return -EINVAL;
+       if (len == 0 || to + len < to)
+               return -EINVAL;
+
+       flush = 0;
+       down_read(&gmap->mm->mmap_sem);
+       for (off = 0; off < len; off += PMD_SIZE) {
+               /* Walk the guest addr space page table */
+               table = gmap->table + (((to + off) >> 53) & 0x7ff);
+               if (*table & _REGION_ENTRY_INV)
+                       goto out;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = table + (((to + off) >> 42) & 0x7ff);
+               if (*table & _REGION_ENTRY_INV)
+                       goto out;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = table + (((to + off) >> 31) & 0x7ff);
+               if (*table & _REGION_ENTRY_INV)
+                       goto out;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = table + (((to + off) >> 20) & 0x7ff);
+
+               /* Clear segment table entry in guest address space. */
+               flush |= gmap_unlink_segment(gmap, table);
+               *table = _SEGMENT_ENTRY_INV;
+       }
+out:
+       up_read(&gmap->mm->mmap_sem);
+       if (flush)
+               gmap_flush_tlb(gmap);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(gmap_unmap_segment);
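The loop above (and the walks in gmap_map_segment and gmap_fault below) resolves a guest address through a fixed four-level hierarchy, taking an 11-bit index per level straight from the address. A small worked sketch of the arithmetic, using a made-up guest address:

	/* Illustrative only: index arithmetic for the four table levels.
	 * For addr = 0x80100000 (2 GB + 1 MB):
	 *   region-1 index = (addr >> 53) & 0x7ff  -> 0
	 *   region-2 index = (addr >> 42) & 0x7ff  -> 0
	 *   region-3 index = (addr >> 31) & 0x7ff  -> 1
	 *   segment  index = (addr >> 20) & 0x7ff  -> 1
	 * Each segment entry covers PMD_SIZE (1 MB) of guest memory. */
	static inline unsigned long *example_next_level(unsigned long *entry,
							unsigned long addr,
							unsigned int shift)
	{
		/* follow the origin stored in the current entry */
		return (unsigned long *)(*entry & _REGION_ENTRY_ORIGIN) +
		       ((addr >> shift) & 0x7ff);
	}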
+
+/**
+ * gmap_map_segment - map a segment to the guest address space
+ * @gmap: pointer to the guest address space structure
+ * @from: source address in the parent address space
+ * @to: target address in the guest address space
+ * @len: length of the memory area to map
+ *
+ * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
+ */
+int gmap_map_segment(struct gmap *gmap, unsigned long from,
+                    unsigned long to, unsigned long len)
+{
+       unsigned long *table;
+       unsigned long off;
+       int flush;
+
+       if ((from | to | len) & (PMD_SIZE - 1))
+               return -EINVAL;
+       if (len == 0 || from + len > PGDIR_SIZE ||
+           from + len < from || to + len < to)
+               return -EINVAL;
+
+       flush = 0;
+       down_read(&gmap->mm->mmap_sem);
+       for (off = 0; off < len; off += PMD_SIZE) {
+               /* Walk the gmap address space page table */
+               table = gmap->table + (((to + off) >> 53) & 0x7ff);
+               if ((*table & _REGION_ENTRY_INV) &&
+                   gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
+                       goto out_unmap;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = table + (((to + off) >> 42) & 0x7ff);
+               if ((*table & _REGION_ENTRY_INV) &&
+                   gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
+                       goto out_unmap;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = table + (((to + off) >> 31) & 0x7ff);
+               if ((*table & _REGION_ENTRY_INV) &&
+                   gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
+                       goto out_unmap;
+               table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
+               table = table + (((to + off) >> 20) & 0x7ff);
+
+               /* Store 'from' address in an invalid segment table entry. */
+               flush |= gmap_unlink_segment(gmap, table);
+               *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
+       }
+       up_read(&gmap->mm->mmap_sem);
+       if (flush)
+               gmap_flush_tlb(gmap);
+       return 0;
+
+out_unmap:
+       up_read(&gmap->mm->mmap_sem);
+       gmap_unmap_segment(gmap, to, len);
+       return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(gmap_map_segment);
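A hedged usage sketch for gmap_map_segment: back the first megabyte of guest real memory with an existing, segment-aligned host mapping. The host_base value is assumed to come from the caller (for instance a MAP_ANONYMOUS mmap of the guest memory):

	/* Illustrative only: map one PMD_SIZE (1 MB) segment of parent memory
	 * at guest real address 0, and later take it away again. */
	static int example_back_guest_ram(struct gmap *gmap, unsigned long host_base)
	{
		int rc;

		rc = gmap_map_segment(gmap, host_base, 0UL, PMD_SIZE);
		if (rc)
			return rc;		/* -EINVAL or -ENOMEM */
		/* ... run the guest ... */
		return gmap_unmap_segment(gmap, 0UL, PMD_SIZE);
	}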
+
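+/**
+ * gmap_fault - resolve a fault on a guest address
+ * @address: address in the guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Returns the corresponding address in the parent address space, -EFAULT
+ * if the guest address is not mapped, or -ENOMEM if an allocation fails.
+ */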
+unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
+{
+       unsigned long *table, vmaddr, segment;
+       struct mm_struct *mm;
+       struct gmap_pgtable *mp;
+       struct gmap_rmap *rmap;
+       struct vm_area_struct *vma;
+       struct page *page;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       current->thread.gmap_addr = address;
+       mm = gmap->mm;
+       /* Walk the gmap address space page table */
+       table = gmap->table + ((address >> 53) & 0x7ff);
+       if (unlikely(*table & _REGION_ENTRY_INV))
+               return -EFAULT;
+       table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+       table = table + ((address >> 42) & 0x7ff);
+       if (unlikely(*table & _REGION_ENTRY_INV))
+               return -EFAULT;
+       table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+       table = table + ((address >> 31) & 0x7ff);
+       if (unlikely(*table & _REGION_ENTRY_INV))
+               return -EFAULT;
+       table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+       table = table + ((address >> 20) & 0x7ff);
+
+       /* Convert the gmap address to an mm address. */
+       segment = *table;
+       if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
+               page = pfn_to_page(segment >> PAGE_SHIFT);
+               mp = (struct gmap_pgtable *) page->index;
+               return mp->vmaddr | (address & ~PMD_MASK);
+       } else if (segment & _SEGMENT_ENTRY_RO) {
+               vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
+               vma = find_vma(mm, vmaddr);
+               if (!vma || vma->vm_start > vmaddr)
+                       return -EFAULT;
+
+               /* Walk the parent mm page table */
+               pgd = pgd_offset(mm, vmaddr);
+               pud = pud_alloc(mm, pgd, vmaddr);
+               if (!pud)
+                       return -ENOMEM;
+               pmd = pmd_alloc(mm, pud, vmaddr);
+               if (!pmd)
+                       return -ENOMEM;
+               if (!pmd_present(*pmd) &&
+                   __pte_alloc(mm, vma, pmd, vmaddr))
+                       return -ENOMEM;
+               /* pmd now points to a valid segment table entry. */
+               rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
+               if (!rmap)
+                       return -ENOMEM;
+               /* Link gmap segment table entry location to page table. */
+               page = pmd_page(*pmd);
+               mp = (struct gmap_pgtable *) page->index;
+               rmap->entry = table;
+               list_add(&rmap->list, &mp->mapper);
+               /* Set gmap segment table entry to page table. */
+               *table = pmd_val(*pmd) & PAGE_MASK;
+               return vmaddr | (address & ~PMD_MASK);
+       }
+       return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(gmap_fault);
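gmap_fault is the piece that connects a "pending" segment entry to the parent's page table on first use; callers get back a parent address that they can then fault in with the normal mm machinery. A hedged sketch of a consumer (the surrounding policy is hypothetical):

	/* Illustrative only: translate a faulting guest address and report
	 * the result the way a low-level fault handler could consume it. */
	static int example_resolve_guest_fault(struct gmap *gmap, unsigned long gaddr)
	{
		unsigned long vmaddr;

		vmaddr = gmap_fault(gaddr, gmap);
		if (IS_ERR_VALUE(vmaddr))
			return (int) vmaddr;	/* -EFAULT or -ENOMEM */
		/* vmaddr now lies inside gmap->mm and can be handled like an
		 * ordinary user-space fault address. */
		return 0;
	}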
+
+void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
+{
+       struct gmap_rmap *rmap, *next;
+       struct gmap_pgtable *mp;
+       struct page *page;
+       int flush;
+
+       flush = 0;
+       spin_lock(&mm->page_table_lock);
+       page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+       mp = (struct gmap_pgtable *) page->index;
+       list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
+               *rmap->entry =
+                       _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
+               list_del(&rmap->list);
+               kfree(rmap);
+               flush = 1;
+       }
+       spin_unlock(&mm->page_table_lock);
+       if (flush)
+               __tlb_flush_global();
+}
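gmap_unmap_notifier is the teardown side of the rmap list: when the parent mm frees a page table (see the page_table_free / page_table_free_rcu hunks below), every gmap segment entry pointing at it is put back into the "pending" state. A hedged sketch of the invariant it restores; the helper is illustrative only:

	/* Illustrative only: what every rmap'ed entry looks like after the
	 * notifier ran - the same pending encoding gmap_map_segment stores. */
	static inline unsigned long example_pending_entry(unsigned long vmaddr)
	{
		return _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (vmaddr & PMD_MASK);
	}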
+
+static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
+                                                   unsigned long vmaddr)
 {
        struct page *page;
        unsigned long *table;
+       struct gmap_pgtable *mp;
 
        page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
        if (!page)
                return NULL;
+       mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
+       if (!mp) {
+               __free_page(page);
+               return NULL;
+       }
        pgtable_page_ctor(page);
+       mp->vmaddr = vmaddr & PMD_MASK;
+       INIT_LIST_HEAD(&mp->mapper);
+       page->index = (unsigned long) mp;
        atomic_set(&page->_mapcount, 3);
        table = (unsigned long *) page_to_phys(page);
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
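With the extra gmap_pgtable hung off page->index, every pgste page table page remembers the parent address it maps and the gmap segment entries pointing at it. A small illustrative helper (not in the patch) showing how the rest of this file recovers that bookkeeping from a table pointer:

	/* Illustrative only: from a page table address back to its gmap
	 * bookkeeping, as done in page_table_free_pgste and
	 * gmap_unmap_notifier. */
	static inline struct gmap_pgtable *example_table_to_mp(unsigned long *table)
	{
		struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);

		return (struct gmap_pgtable *) page->index;
	}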
@@ -167,24 +512,57 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
 static inline void page_table_free_pgste(unsigned long *table)
 {
        struct page *page;
+       struct gmap_pgtable *mp;
 
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+       mp = (struct gmap_pgtable *) page->index;
+       BUG_ON(!list_empty(&mp->mapper));
        pgtable_page_ctor(page);
        atomic_set(&page->_mapcount, -1);
+       kfree(mp);
        __free_page(page);
 }
-#endif
 
-unsigned long *page_table_alloc(struct mm_struct *mm)
+#else /* CONFIG_PGSTE */
+
+static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
+                                                   unsigned long vmaddr)
+{
+       return NULL;
+}
+
+static inline void page_table_free_pgste(unsigned long *table)
+{
+}
+
+static inline void gmap_unmap_notifier(struct mm_struct *mm,
+                                         unsigned long *table)
+{
+}
+
+#endif /* CONFIG_PGSTE */
+
+static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
+{
+       unsigned int old, new;
+
+       do {
+               old = atomic_read(v);
+               new = old ^ bits;
+       } while (atomic_cmpxchg(v, old, new) != old);
+       return new;
+}
+
+/*
+ * page table entry allocation/free routines.
+ */
+unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
 {
        struct page *page;
        unsigned long *table;
        unsigned int mask, bit;
 
-#ifdef CONFIG_PGSTE
        if (mm_has_pgste(mm))
-               return page_table_alloc_pgste(mm);
-#endif
+               return page_table_alloc_pgste(mm, vmaddr);
        /* Allocate fragments of a 4K page as 1K/2K page table */
        spin_lock_bh(&mm->context.list_lock);
        mask = FRAG_MASK;
@@ -222,10 +600,10 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
        struct page *page;
        unsigned int bit, mask;
 
-#ifdef CONFIG_PGSTE
-       if (mm_has_pgste(mm))
+       if (mm_has_pgste(mm)) {
+               gmap_unmap_notifier(mm, table);
                return page_table_free_pgste(table);
-#endif
+       }
        /* Free 1K/2K page table fragment of a 4K page */
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
@@ -249,10 +627,8 @@ static void __page_table_free_rcu(void *table, unsigned bit)
 {
        struct page *page;
 
-#ifdef CONFIG_PGSTE
        if (bit == FRAG_MASK)
                return page_table_free_pgste(table);
-#endif
        /* Free 1K/2K page table fragment of a 4K page */
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
@@ -269,13 +645,12 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
        unsigned int bit, mask;
 
        mm = tlb->mm;
-#ifdef CONFIG_PGSTE
        if (mm_has_pgste(mm)) {
+               gmap_unmap_notifier(mm, table);
                table = (unsigned long *) (__pa(table) | FRAG_MASK);
                tlb_remove_table(tlb, table);
                return;
        }
-#endif
        bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock_bh(&mm->context.list_lock);