Merge branch 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux-2.6.git] / arch / x86 / mm / pat.c
index b2f7d3e59b86b656d49993a5870e9e47651e3d7c..d7ebc3a10f2f1aa96aa00913e4b3f2472149facc 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/rbtree.h>
 
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
@@ -148,11 +149,10 @@ static char *cattr_name(unsigned long flags)
  * areas). All the aliases have the same cache attributes of course.
  * Zero attributes are represented as holes.
  *
- * Currently the data structure is a list because the number of mappings
- * are expected to be relatively small. If this should be a problem
- * it could be changed to a rbtree or similar.
+ * The data structure is a list that is also organized as an rbtree
+ * sorted on the start address of memtype range.
  *
- * memtype_lock protects the whole list.
+ * memtype_lock protects both the linear list and rbtree.
  */
 
 struct memtype {
@@ -160,11 +160,53 @@ struct memtype {
        u64                     end;
        unsigned long           type;
        struct list_head        nd;
+       struct rb_node          rb;
 };
 
+static struct rb_root memtype_rbroot = RB_ROOT;
 static LIST_HEAD(memtype_list);
 static DEFINE_SPINLOCK(memtype_lock);  /* protects memtype list */
 
+static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
+{
+       struct rb_node *node = root->rb_node;
+       struct memtype *last_lower = NULL;
+
+       while (node) {
+               struct memtype *data = container_of(node, struct memtype, rb);
+
+               if (data->start < start) {
+                       last_lower = data;
+                       node = node->rb_right;
+               } else if (data->start > start) {
+                       node = node->rb_left;
+               } else
+                       return data;
+       }
+
+       /* Will return NULL if there is no entry with its start <= start */
+       return last_lower;
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
+{
+       struct rb_node **new = &(root->rb_node);
+       struct rb_node *parent = NULL;
+
+       while (*new) {
+               struct memtype *this = container_of(*new, struct memtype, rb);
+
+               parent = *new;
+               if (data->start <= this->start)
+                       new = &((*new)->rb_left);
+               else if (data->start > this->start)
+                       new = &((*new)->rb_right);
+       }
+
+       rb_link_node(&data->rb, parent, new);
+       rb_insert_color(&data->rb, root);
+}
+
 /*
  * Does intersection of PAT memory type and MTRR memory type and returns
  * the resulting memory type as PAT understands it.
@@ -218,9 +260,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
        return -EBUSY;
 }
 
-static struct memtype *cached_entry;
-static u64 cached_start;
-
 static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
 {
        int ram_page = 0, not_rampage = 0;
@@ -249,63 +288,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
 }
 
 /*
- * For RAM pages, mark the pages as non WB memory type using
- * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
- * set_memory_wc() on a RAM page at a time before marking it as WB again.
- * This is ok, because only one driver will be owning the page and
- * doing set_memory_*() calls.
+ * For RAM pages, we use page flags to mark the pages with appropriate type.
+ * Here we do two passes:
+ * - Find the memtype of all the pages in the range, look for any conflicts
+ * - In case of no conflicts, set the new memtype for pages in the range
  *
- * For now, we use PageNonWB to track that the RAM page is being mapped
- * as non WB. In future, we will have to use one more flag
- * (or some other mechanism in page_struct) to distinguish between
- * UC and WC mapping.
+ * Caller must hold memtype_lock for atomicity.
  */
 static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
                                  unsigned long *new_type)
 {
        struct page *page;
-       u64 pfn, end_pfn;
+       u64 pfn;
+
+       if (req_type == _PAGE_CACHE_UC) {
+               /* We do not support strong UC */
+               WARN_ON_ONCE(1);
+               req_type = _PAGE_CACHE_UC_MINUS;
+       }
 
        for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
-               page = pfn_to_page(pfn);
-               if (page_mapped(page) || PageNonWB(page))
-                       goto out;
+               unsigned long type;
 
-               SetPageNonWB(page);
+               page = pfn_to_page(pfn);
+               type = get_page_memtype(page);
+               if (type != -1) {
+                       printk(KERN_INFO "reserve_ram_pages_type failed "
+                               "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
+                               start, end, type, req_type);
+                       if (new_type)
+                               *new_type = type;
+
+                       return -EBUSY;
+               }
        }
-       return 0;
 
-out:
-       end_pfn = pfn;
-       for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+       if (new_type)
+               *new_type = req_type;
+
+       for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
                page = pfn_to_page(pfn);
-               ClearPageNonWB(page);
+               set_page_memtype(page, req_type);
        }
-
-       return -EINVAL;
+       return 0;
 }
 
 static int free_ram_pages_type(u64 start, u64 end)
 {
        struct page *page;
-       u64 pfn, end_pfn;
+       u64 pfn;
 
        for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
                page = pfn_to_page(pfn);
-               if (page_mapped(page) || !PageNonWB(page))
-                       goto out;
-
-               ClearPageNonWB(page);
+               set_page_memtype(page, -1);
        }
        return 0;
-
-out:
-       end_pfn = pfn;
-       for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
-               page = pfn_to_page(pfn);
-               SetPageNonWB(page);
-       }
-       return -EINVAL;
 }
 
 /*
@@ -339,6 +376,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
                if (new_type) {
                        if (req_type == -1)
                                *new_type = _PAGE_CACHE_WB;
+                       else if (req_type == _PAGE_CACHE_WC)
+                               *new_type = _PAGE_CACHE_UC_MINUS;
                        else
                                *new_type = req_type & _PAGE_CACHE_MASK;
                }
@@ -364,11 +403,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
                *new_type = actual_type;
 
        is_range_ram = pat_pagerange_is_ram(start, end);
-       if (is_range_ram == 1)
-               return reserve_ram_pages_type(start, end, req_type,
-                                             new_type);
-       else if (is_range_ram < 0)
+       if (is_range_ram == 1) {
+
+               spin_lock(&memtype_lock);
+               err = reserve_ram_pages_type(start, end, req_type, new_type);
+               spin_unlock(&memtype_lock);
+
+               return err;
+       } else if (is_range_ram < 0) {
                return -EINVAL;
+       }
 
        new  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
        if (!new)
@@ -380,17 +424,19 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
        spin_lock(&memtype_lock);
 
-       if (cached_entry && start >= cached_start)
-               entry = cached_entry;
-       else
+       entry = memtype_rb_search(&memtype_rbroot, new->start);
+       if (likely(entry != NULL)) {
+               /* To work correctly with list_for_each_entry_continue */
+               entry = list_entry(entry->nd.prev, struct memtype, nd);
+       } else {
                entry = list_entry(&memtype_list, struct memtype, nd);
+       }
 
        /* Search for existing mapping that overlaps the current range */
        where = NULL;
        list_for_each_entry_continue(entry, &memtype_list, nd) {
                if (end <= entry->start) {
                        where = entry->nd.prev;
-                       cached_entry = list_entry(where, struct memtype, nd);
                        break;
                } else if (start <= entry->start) { /* end > entry->start */
                        err = chk_conflict(new, entry, new_type);
@@ -398,8 +444,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
                                dprintk("Overlap at 0x%Lx-0x%Lx\n",
                                        entry->start, entry->end);
                                where = entry->nd.prev;
-                               cached_entry = list_entry(where,
-                                                       struct memtype, nd);
                        }
                        break;
                } else if (start < entry->end) { /* start > entry->start */
@@ -407,8 +451,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
                        if (!err) {
                                dprintk("Overlap at 0x%Lx-0x%Lx\n",
                                        entry->start, entry->end);
-                               cached_entry = list_entry(entry->nd.prev,
-                                                       struct memtype, nd);
 
                                /*
                                 * Move to right position in the linked
@@ -436,13 +478,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
                return err;
        }
 
-       cached_start = start;
-
        if (where)
                list_add(&new->nd, where);
        else
                list_add_tail(&new->nd, &memtype_list);
 
+       memtype_rb_insert(&memtype_rbroot, new);
+
        spin_unlock(&memtype_lock);
 
        dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -454,7 +496,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 int free_memtype(u64 start, u64 end)
 {
-       struct memtype *entry;
+       struct memtype *entry, *saved_entry;
        int err = -EINVAL;
        int is_range_ram;
 
@@ -466,23 +508,58 @@ int free_memtype(u64 start, u64 end)
                return 0;
 
        is_range_ram = pat_pagerange_is_ram(start, end);
-       if (is_range_ram == 1)
-               return free_ram_pages_type(start, end);
-       else if (is_range_ram < 0)
+       if (is_range_ram == 1) {
+
+               spin_lock(&memtype_lock);
+               err = free_ram_pages_type(start, end);
+               spin_unlock(&memtype_lock);
+
+               return err;
+       } else if (is_range_ram < 0) {
                return -EINVAL;
+       }
 
        spin_lock(&memtype_lock);
+
+       entry = memtype_rb_search(&memtype_rbroot, start);
+       if (unlikely(entry == NULL))
+               goto unlock_ret;
+
+       /*
+        * The saved entry points to an entry whose start is the same as or
+        * less than the start we searched for. Now walk the list in both
+        * directions to find the entry that matches both start and end,
+        * relying on the list being sorted by start address.
+        */
+       saved_entry = entry;
        list_for_each_entry(entry, &memtype_list, nd) {
                if (entry->start == start && entry->end == end) {
-                       if (cached_entry == entry || cached_start == start)
-                               cached_entry = NULL;
+                       rb_erase(&entry->rb, &memtype_rbroot);
+                       list_del(&entry->nd);
+                       kfree(entry);
+                       err = 0;
+                       break;
+               } else if (entry->start > start) {
+                       break;
+               }
+       }
+
+       if (!err)
+               goto unlock_ret;
 
+       entry = saved_entry;
+       list_for_each_entry_reverse(entry, &memtype_list, nd) {
+               if (entry->start == start && entry->end == end) {
+                       rb_erase(&entry->rb, &memtype_rbroot);
                        list_del(&entry->nd);
                        kfree(entry);
                        err = 0;
                        break;
+               } else if (entry->start < start) {
+                       break;
                }
        }
+unlock_ret:
        spin_unlock(&memtype_lock);
 
        if (err) {
@@ -496,6 +573,101 @@ int free_memtype(u64 start, u64 end)
 }
 
 
+/**
+ * lookup_memtype - Looks up the memory type for a physical address
+ * @paddr: physical address of which memory type needs to be looked up
+ *
+ * Only to be called when PAT is enabled
+ *
+ * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
+ * _PAGE_CACHE_UC
+ */
+static unsigned long lookup_memtype(u64 paddr)
+{
+       int rettype = _PAGE_CACHE_WB;
+       struct memtype *entry;
+
+       if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
+               return rettype;
+
+       if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
+               struct page *page;
+               spin_lock(&memtype_lock);
+               page = pfn_to_page(paddr >> PAGE_SHIFT);
+               rettype = get_page_memtype(page);
+               spin_unlock(&memtype_lock);
+               /*
+                * -1 from get_page_memtype() implies RAM page is in its
+                * default state and not reserved, and hence of type WB
+                */
+               if (rettype == -1)
+                       rettype = _PAGE_CACHE_WB;
+
+               return rettype;
+       }
+
+       spin_lock(&memtype_lock);
+
+       entry = memtype_rb_search(&memtype_rbroot, paddr);
+       if (entry != NULL)
+               rettype = entry->type;
+       else
+               rettype = _PAGE_CACHE_UC_MINUS;
+
+       spin_unlock(&memtype_lock);
+       return rettype;
+}
+
+/**
+ * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ * @type: A pointer to memtype, with requested type. On success, requested
+ * or any other compatible type that was available for the region is returned
+ *
+ * On success, returns 0
+ * On failure, returns non-zero
+ */
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+                       unsigned long *type)
+{
+       resource_size_t size = end - start;
+       unsigned long req_type = *type;
+       unsigned long new_type;
+       int ret;
+
+       WARN_ON_ONCE(iomem_map_sanity_check(start, size));
+
+       ret = reserve_memtype(start, end, req_type, &new_type);
+       if (ret)
+               goto out_err;
+
+       if (!is_new_memtype_allowed(start, size, req_type, new_type))
+               goto out_free;
+
+       if (kernel_map_sync_memtype(start, size, new_type) < 0)
+               goto out_free;
+
+       *type = new_type;
+       return 0;
+
+out_free:
+       free_memtype(start, end);
+       ret = -EBUSY;
+out_err:
+       return ret;
+}
+
+/**
+ * io_free_memtype - Release a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ */
+void io_free_memtype(resource_size_t start, resource_size_t end)
+{
+       free_memtype(start, end);
+}
+
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t vma_prot)
 {
@@ -577,7 +749,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
 {
        unsigned long id_sz;
 
-       if (!pat_enabled || base >= __pa(high_memory))
+       if (base >= __pa(high_memory))
                return 0;
 
        id_sz = (__pa(high_memory) < base + size) ?
@@ -612,11 +784,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
        is_ram = pat_pagerange_is_ram(paddr, paddr + size);
 
        /*
-        * reserve_pfn_range() doesn't support RAM pages. Maintain the current
-        * behavior with RAM pages by returning success.
+        * reserve_pfn_range() for RAM pages. We do not refcount to keep
+        * track of number of mappings of RAM pages. We can assert that
+        * the type requested matches the type of first page in the range.
         */
-       if (is_ram != 0)
+       if (is_ram) {
+               if (!pat_enabled)
+                       return 0;
+
+               flags = lookup_memtype(paddr);
+               if (want_flags != flags) {
+                       printk(KERN_WARNING
+                       "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+                               current->comm, current->pid,
+                               cattr_name(want_flags),
+                               (unsigned long long)paddr,
+                               (unsigned long long)(paddr + size),
+                               cattr_name(flags));
+                       *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+                                             (~_PAGE_CACHE_MASK)) |
+                                            flags);
+               }
                return 0;
+       }
 
        ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
        if (ret)
@@ -678,14 +868,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
        unsigned long vma_size = vma->vm_end - vma->vm_start;
        pgprot_t pgprot;
 
-       if (!pat_enabled)
-               return 0;
-
-       /*
-        * For now, only handle remap_pfn_range() vmas where
-        * is_linear_pfn_mapping() == TRUE. Handling of
-        * vm_insert_pfn() is TBD.
-        */
        if (is_linear_pfn_mapping(vma)) {
                /*
                 * reserve the whole chunk covered by vma. We need the
@@ -713,23 +895,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
 int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
                        unsigned long pfn, unsigned long size)
 {
+       unsigned long flags;
        resource_size_t paddr;
        unsigned long vma_size = vma->vm_end - vma->vm_start;
 
-       if (!pat_enabled)
-               return 0;
-
-       /*
-        * For now, only handle remap_pfn_range() vmas where
-        * is_linear_pfn_mapping() == TRUE. Handling of
-        * vm_insert_pfn() is TBD.
-        */
        if (is_linear_pfn_mapping(vma)) {
                /* reserve the whole chunk starting from vm_pgoff */
                paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
                return reserve_pfn_range(paddr, vma_size, prot, 0);
        }
 
+       if (!pat_enabled)
+               return 0;
+
+       /* for vm_insert_pfn and friends, we set prot based on lookup */
+       flags = lookup_memtype(pfn << PAGE_SHIFT);
+       *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+                        flags);
+
        return 0;
 }
 
@@ -744,14 +927,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
        resource_size_t paddr;
        unsigned long vma_size = vma->vm_end - vma->vm_start;
 
-       if (!pat_enabled)
-               return;
-
-       /*
-        * For now, only handle remap_pfn_range() vmas where
-        * is_linear_pfn_mapping() == TRUE. Handling of
-        * vm_insert_pfn() is TBD.
-        */
        if (is_linear_pfn_mapping(vma)) {
                /* free the whole chunk starting from vm_pgoff */
                paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;