Merge branch 'akpm' (Andrew's patchbomb)
Linus Torvalds [Wed, 12 Dec 2012 02:05:37 +0000 (18:05 -0800)]
Merge misc updates from Andrew Morton:
 "About half of most of MM.  Going very early this time due to
  uncertainty over the coreautounifiednumasched things.  I'll send the
  other half of most of MM tomorrow.  The rest of MM awaits a slab merge
  from Pekka."

* emailed patches from Andrew Morton: (71 commits)
  memory_hotplug: ensure every online node has NORMAL memory
  memory_hotplug: handle empty zone when online_movable/online_kernel
  mm, memory-hotplug: dynamic configure movable memory and portion memory
  drivers/base/node.c: cleanup node_state_attr[]
  bootmem: fix wrong call parameter for free_bootmem()
  avr32, kconfig: remove HAVE_ARCH_BOOTMEM
  mm: cma: remove watermark hacks
  mm: cma: skip watermarks check for already isolated blocks in split_free_page()
  mm, oom: fix race when specifying a thread as the oom origin
  mm, oom: change type of oom_score_adj to short
  mm: cleanup register_node()
  mm, mempolicy: remove duplicate code
  mm/vmscan.c: try_to_freeze() returns boolean
  mm: introduce putback_movable_pages()
  virtio_balloon: introduce migration primitives to balloon pages
  mm: introduce compaction and migration for ballooned pages
  mm: introduce a common interface for balloon pages mobility
  mm: redefine address_space.assoc_mapping
  mm: adjust address_space_operations.migratepage() return code
  arch/sparc/kernel/sys_sparc_64.c: s/COLOUR/COLOR/
  ...

96 files changed:
Documentation/cgroups/memory.txt
Documentation/memory-hotplug.txt
arch/alpha/include/asm/mman.h
arch/arm/mm/mmap.c
arch/avr32/Kconfig
arch/mips/include/uapi/asm/mman.h
arch/mips/mm/mmap.c
arch/parisc/include/uapi/asm/mman.h
arch/powerpc/kernel/sysfs.c
arch/powerpc/platforms/cell/celleb_pci.c
arch/s390/include/asm/page.h
arch/sh/mm/mmap.c
arch/sparc/kernel/sys_sparc_32.c
arch/sparc/kernel/sys_sparc_64.c
arch/sparc/mm/hugetlbpage.c
arch/tile/mm/hugetlbpage.c
arch/x86/include/asm/elf.h
arch/x86/include/asm/mman.h
arch/x86/kernel/sys_x86_64.c
arch/x86/mm/hugetlbpage.c
arch/x86/vdso/vma.c
arch/xtensa/include/uapi/asm/mman.h
drivers/base/memory.c
drivers/base/node.c
drivers/macintosh/smu.c
drivers/staging/android/lowmemorykiller.c
drivers/virtio/virtio_balloon.c
fs/btrfs/disk-io.c
fs/btrfs/file.c
fs/btrfs/ioctl.c
fs/buffer.c
fs/gfs2/glock.c
fs/hugetlbfs/inode.c
fs/inode.c
fs/nilfs2/page.c
fs/ocfs2/file.c
fs/proc/base.c
fs/splice.c
include/linux/balloon_compaction.h [new file with mode: 0644]
include/linux/bootmem.h
include/linux/fs.h
include/linux/gfp.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/kernel.h
include/linux/memory.h
include/linux/memory_hotplug.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/node.h
include/linux/oom.h
include/linux/page-isolation.h
include/linux/pagemap.h
include/linux/sched.h
include/linux/shm.h
include/linux/types.h
include/linux/writeback.h
include/trace/events/oom.h
include/trace/events/task.h
include/uapi/asm-generic/mman-common.h
include/uapi/asm-generic/mman.h
ipc/shm.c
lib/cpumask.c
mm/Kconfig
mm/Makefile
mm/balloon_compaction.c [new file with mode: 0644]
mm/bootmem.c
mm/compaction.c
mm/dmapool.c
mm/highmem.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/ksm.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mmap.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_cgroup.c
mm/page_isolation.c
mm/rmap.c
mm/slub.c
mm/sparse.c
mm/swapfile.c
mm/vmalloc.c
mm/vmscan.c
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/thuge-gen.c [new file with mode: 0644]

index 71c4da4..a25cb3f 100644 (file)
@@ -144,9 +144,9 @@ Figure 1 shows the important aspects of the controller
 3. Each page has a pointer to the page_cgroup, which in turn knows the
    cgroup it belongs to
 
-The accounting is done as follows: mem_cgroup_charge() is invoked to set up
-the necessary data structures and check if the cgroup that is being charged
-is over its limit. If it is, then reclaim is invoked on the cgroup.
+The accounting is done as follows: mem_cgroup_charge_common() is invoked to
+set up the necessary data structures and check if the cgroup that is being
+charged is over its limit. If it is, then reclaim is invoked on the cgroup.
 More details can be found in the reclaim section of this document.
 If everything goes well, a page meta-data-structure called page_cgroup is
 updated. page_cgroup has its own LRU on cgroup.
index 6d0c251..c6f993d 100644 (file)
@@ -161,7 +161,8 @@ a recent addition and not present on older kernels.
                    in the memory block.
 'state'           : read-write
                     at read:  contains online/offline state of memory.
-                    at write: user can specify "online", "offline" command
+                    at write: user can specify "online_kernel",
+                    "online_movable", "online", "offline" command
                     which will be performed on all sections in the block.
 'phys_device'     : read-only: designed to show the name of physical memory
                     device.  This is not well implemented now.
@@ -255,6 +256,17 @@ For onlining, you have to write "online" to the section's state file as:
 
 % echo online > /sys/devices/system/memory/memoryXXX/state
 
+This onlining will not change the ZONE type of the target memory section.
+If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
+
+% echo online_movable > /sys/devices/system/memory/memoryXXX/state
+(NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE)
+
+And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
+
+% echo online_kernel > /sys/devices/system/memory/memoryXXX/state
+(NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL)
+
 After this, section memoryXXX's state will be 'online' and the amount of
 available memory will be increased.
 
@@ -377,15 +389,18 @@ The third argument is passed by pointer of struct memory_notify.
 struct memory_notify {
        unsigned long start_pfn;
        unsigned long nr_pages;
+       int status_change_nid_normal;
        int status_change_nid;
 }
 
 start_pfn is start_pfn of online/offline memory.
 nr_pages is # of pages of online/offline memory.
+status_change_nid_normal is set to the node id when N_NORMAL_MEMORY of nodemask
+is (will be) set/cleared. If this is -1, then nodemask status is not changed.
 status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
 set/clear. It means a new(memoryless) node gets new memory by online and a
 node loses all memory. If this is -1, then nodemask status is not changed.
-If status_changed_nid >= 0, callback should create/discard structures for the
+If status_change_nid* >= 0, callback should create/discard structures for the
 node if necessary.
 
 --------------
index cbeb361..0086b47 100644 (file)
 /* compatibility flags */
 #define MAP_FILE       0
 
+/*
+ * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all powers of two.
+ * When 0 use the default page size.
+ */
+#define MAP_HUGE_SHIFT 26
+#define MAP_HUGE_MASK  0x3f
+
 #endif /* __ALPHA_MMAN_H__ */
index 89f2b7f..10062ce 100644 (file)
 #include <linux/random.h>
 #include <asm/cachetype.h>
 
-static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
-                                             unsigned long pgoff)
-{
-       unsigned long base = addr & ~(SHMLBA-1);
-       unsigned long off = (pgoff << PAGE_SHIFT) & (SHMLBA-1);
-
-       if (base + off <= addr)
-               return base + off;
-
-       return base - off;
-}
-
 #define COLOUR_ALIGN(addr,pgoff)               \
        ((((addr)+SHMLBA-1)&~(SHMLBA-1)) +      \
         (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))
@@ -69,9 +57,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
-       unsigned long start_addr;
        int do_align = 0;
        int aliasing = cache_is_vipt_aliasing();
+       struct vm_unmapped_area_info info;
 
        /*
         * We only need to do colour alignment if either the I or D
@@ -104,46 +92,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
                    (!vma || addr + len <= vma->vm_start))
                        return addr;
        }
-       if (len > mm->cached_hole_size) {
-               start_addr = addr = mm->free_area_cache;
-       } else {
-               start_addr = addr = mm->mmap_base;
-               mm->cached_hole_size = 0;
-       }
 
-full_search:
-       if (do_align)
-               addr = COLOUR_ALIGN(addr, pgoff);
-       else
-               addr = PAGE_ALIGN(addr);
-
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (TASK_SIZE - len < addr) {
-                       /*
-                        * Start a new search - just in case we missed
-                        * some holes.
-                        */
-                       if (start_addr != TASK_UNMAPPED_BASE) {
-                               start_addr = addr = TASK_UNMAPPED_BASE;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (!vma || addr + len <= vma->vm_start) {
-                       /*
-                        * Remember the place where we stopped the search:
-                        */
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-               addr = vma->vm_end;
-               if (do_align)
-                       addr = COLOUR_ALIGN(addr, pgoff);
-       }
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = mm->mmap_base;
+       info.high_limit = TASK_SIZE;
+       info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
+       return vm_unmapped_area(&info);
 }
 
 unsigned long
@@ -156,6 +112,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        unsigned long addr = addr0;
        int do_align = 0;
        int aliasing = cache_is_vipt_aliasing();
+       struct vm_unmapped_area_info info;
 
        /*
         * We only need to do colour alignment if either the I or D
@@ -187,70 +144,27 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                        return addr;
        }
 
-       /* check if free_area_cache is useful for us */
-       if (len <= mm->cached_hole_size) {
-               mm->cached_hole_size = 0;
-               mm->free_area_cache = mm->mmap_base;
-       }
-
-       /* either no address requested or can't fit in requested address hole */
-       addr = mm->free_area_cache;
-       if (do_align) {
-               unsigned long base = COLOUR_ALIGN_DOWN(addr - len, pgoff);
-               addr = base + len;
-       }
-
-       /* make sure it can fit in the remaining address space */
-       if (addr > len) {
-               vma = find_vma(mm, addr-len);
-               if (!vma || addr <= vma->vm_start)
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr-len);
-       }
-
-       if (mm->mmap_base < len)
-               goto bottomup;
-
-       addr = mm->mmap_base - len;
-       if (do_align)
-               addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-
-       do {
-               /*
-                * Lookup failure means no vma is above this address,
-                * else if new region fits below vma->vm_start,
-                * return with success:
-                */
-               vma = find_vma(mm, addr);
-               if (!vma || addr+len <= vma->vm_start)
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr);
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = PAGE_SIZE;
+       info.high_limit = mm->mmap_base;
+       info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
+       addr = vm_unmapped_area(&info);
 
-               /* remember the largest hole we saw so far */
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               /* try just below the current vma->vm_start */
-               addr = vma->vm_start - len;
-               if (do_align)
-                       addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-       } while (len < vma->vm_start);
-
-bottomup:
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
-       mm->cached_hole_size = ~0UL;
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = mm->mmap_base;
-       mm->cached_hole_size = ~0UL;
+       if (addr & ~PAGE_MASK) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.flags = 0;
+               info.low_limit = mm->mmap_base;
+               info.high_limit = TASK_SIZE;
+               addr = vm_unmapped_area(&info);
+       }
 
        return addr;
 }
index 06e73bf..c2bbc9a 100644 (file)
@@ -193,9 +193,6 @@ source "kernel/Kconfig.preempt"
 config QUICKLIST
        def_bool y
 
-config HAVE_ARCH_BOOTMEM
-       def_bool n
-
 config ARCH_HAVE_MEMORY_PRESENT
        def_bool n
 
index 46d3da0..9a936ac 100644 (file)
 /* compatibility flags */
 #define MAP_FILE       0
 
+/*
+ * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all powers of two.
+ * When 0 use the default page size.
+ */
+#define MAP_HUGE_SHIFT 26
+#define MAP_HUGE_MASK  0x3f
+
 #endif /* _ASM_MMAN_H */
index 302d779..d9be754 100644 (file)
@@ -45,18 +45,6 @@ static unsigned long mmap_base(unsigned long rnd)
        return PAGE_ALIGN(TASK_SIZE - gap - rnd);
 }
 
-static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
-                                             unsigned long pgoff)
-{
-       unsigned long base = addr & ~shm_align_mask;
-       unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask;
-
-       if (base + off <= addr)
-               return base + off;
-
-       return base - off;
-}
-
 #define COLOUR_ALIGN(addr, pgoff)                              \
        ((((addr) + shm_align_mask) & ~shm_align_mask) +        \
         (((pgoff) << PAGE_SHIFT) & shm_align_mask))
@@ -71,6 +59,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
        struct vm_area_struct *vma;
        unsigned long addr = addr0;
        int do_color_align;
+       struct vm_unmapped_area_info info;
 
        if (unlikely(len > TASK_SIZE))
                return -ENOMEM;
@@ -107,97 +96,31 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
                        return addr;
        }
 
-       if (dir == UP) {
-               addr = mm->mmap_base;
-               if (do_color_align)
-                       addr = COLOUR_ALIGN(addr, pgoff);
-               else
-                       addr = PAGE_ALIGN(addr);
+       info.length = len;
+       info.align_mask = do_color_align ? (PAGE_MASK & shm_align_mask) : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
 
-               for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
-                       /* At this point:  (!vma || addr < vma->vm_end). */
-                       if (TASK_SIZE - len < addr)
-                               return -ENOMEM;
-                       if (!vma || addr + len <= vma->vm_start)
-                               return addr;
-                       addr = vma->vm_end;
-                       if (do_color_align)
-                               addr = COLOUR_ALIGN(addr, pgoff);
-                }
-        } else {
-               /* check if free_area_cache is useful for us */
-               if (len <= mm->cached_hole_size) {
-                       mm->cached_hole_size = 0;
-                       mm->free_area_cache = mm->mmap_base;
-               }
+       if (dir == DOWN) {
+               info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+               info.low_limit = PAGE_SIZE;
+               info.high_limit = mm->mmap_base;
+               addr = vm_unmapped_area(&info);
+
+               if (!(addr & ~PAGE_MASK))
+                       return addr;
 
-               /*
-                * either no address requested, or the mapping can't fit into
-                * the requested address hole
-                */
-               addr = mm->free_area_cache;
-               if (do_color_align) {
-                       unsigned long base =
-                               COLOUR_ALIGN_DOWN(addr - len, pgoff);
-                       addr = base + len;
-               }
-
-               /* make sure it can fit in the remaining address space */
-               if (likely(addr > len)) {
-                       vma = find_vma(mm, addr - len);
-                       if (!vma || addr <= vma->vm_start) {
-                               /* cache the address as a hint for next time */
-                               return mm->free_area_cache = addr - len;
-                       }
-               }
-
-               if (unlikely(mm->mmap_base < len))
-                       goto bottomup;
-
-               addr = mm->mmap_base - len;
-               if (do_color_align)
-                       addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-
-               do {
-                       /*
-                        * Lookup failure means no vma is above this address,
-                        * else if new region fits below vma->vm_start,
-                        * return with success:
-                        */
-                       vma = find_vma(mm, addr);
-                       if (likely(!vma || addr + len <= vma->vm_start)) {
-                               /* cache the address as a hint for next time */
-                               return mm->free_area_cache = addr;
-                       }
-
-                       /* remember the largest hole we saw so far */
-                       if (addr + mm->cached_hole_size < vma->vm_start)
-                               mm->cached_hole_size = vma->vm_start - addr;
-
-                       /* try just below the current vma->vm_start */
-                       addr = vma->vm_start - len;
-                       if (do_color_align)
-                               addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-               } while (likely(len < vma->vm_start));
-
-bottomup:
                /*
                 * A failed mmap() very likely causes application failure,
                 * so fall back to the bottom-up function here. This scenario
                 * can happen with large stack limits and large mmap()
                 * allocations.
                 */
-               mm->cached_hole_size = ~0UL;
-               mm->free_area_cache = TASK_UNMAPPED_BASE;
-               addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-               /*
-                * Restore the topdown base:
-                */
-               mm->free_area_cache = mm->mmap_base;
-               mm->cached_hole_size = ~0UL;
-
-               return addr;
        }
+
+       info.flags = 0;
+       info.low_limit = mm->mmap_base;
+       info.high_limit = TASK_SIZE;
+       return vm_unmapped_area(&info);
 }
 
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0,
index 12219eb..294d251 100644 (file)
 #define MAP_FILE       0
 #define MAP_VARIABLE   0
 
+/*
+ * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all powers of two.
+ * When 0 use the default page size.
+ */
+#define MAP_HUGE_SHIFT 26
+#define MAP_HUGE_MASK  0x3f
+
 #endif /* __PARISC_MMAN_H__ */
index cf357a0..3ce1f86 100644 (file)
@@ -607,7 +607,7 @@ static void register_nodes(void)
 
 int sysfs_add_device_to_node(struct device *dev, int nid)
 {
-       struct node *node = &node_devices[nid];
+       struct node *node = node_devices[nid];
        return sysfs_create_link(&node->dev.kobj, &dev->kobj,
                        kobject_name(&dev->kobj));
 }
@@ -615,7 +615,7 @@ EXPORT_SYMBOL_GPL(sysfs_add_device_to_node);
 
 void sysfs_remove_device_from_node(struct device *dev, int nid)
 {
-       struct node *node = &node_devices[nid];
+       struct node *node = node_devices[nid];
        sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj));
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node);
index abc8af4..1735681 100644 (file)
@@ -401,11 +401,11 @@ error:
        } else {
                if (config && *config) {
                        size = 256;
-                       free_bootmem((unsigned long)(*config), size);
+                       free_bootmem(__pa(*config), size);
                }
                if (res && *res) {
                        size = sizeof(struct celleb_pci_resource);
-                       free_bootmem((unsigned long)(*res), size);
+                       free_bootmem(__pa(*res), size);
                }
        }
 
index 6d53670..39faa4a 100644 (file)
@@ -158,6 +158,9 @@ static inline int page_reset_referenced(unsigned long addr)
  * race against modification of the referenced bit. This function
  * should therefore only be called if it is not mapped in any
  * address space.
+ *
+ * Note that the bit gets set whenever page content is changed. That
+ * includes modification by DMA or from inside the kernel.
  */
 #define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY
 static inline int page_test_and_clear_dirty(unsigned long pfn, int mapped)
index 80bf494..6777177 100644 (file)
@@ -30,25 +30,13 @@ static inline unsigned long COLOUR_ALIGN(unsigned long addr,
        return base + off;
 }
 
-static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
-                                             unsigned long pgoff)
-{
-       unsigned long base = addr & ~shm_align_mask;
-       unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask;
-
-       if (base + off <= addr)
-               return base + off;
-
-       return base - off;
-}
-
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
        unsigned long len, unsigned long pgoff, unsigned long flags)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
-       unsigned long start_addr;
        int do_colour_align;
+       struct vm_unmapped_area_info info;
 
        if (flags & MAP_FIXED) {
                /* We do not accept a shared mapping if it would violate
@@ -79,47 +67,13 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
                        return addr;
        }
 
-       if (len > mm->cached_hole_size) {
-               start_addr = addr = mm->free_area_cache;
-       } else {
-               mm->cached_hole_size = 0;
-               start_addr = addr = TASK_UNMAPPED_BASE;
-       }
-
-full_search:
-       if (do_colour_align)
-               addr = COLOUR_ALIGN(addr, pgoff);
-       else
-               addr = PAGE_ALIGN(mm->free_area_cache);
-
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (unlikely(TASK_SIZE - len < addr)) {
-                       /*
-                        * Start a new search - just in case we missed
-                        * some holes.
-                        */
-                       if (start_addr != TASK_UNMAPPED_BASE) {
-                               start_addr = addr = TASK_UNMAPPED_BASE;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (likely(!vma || addr + len <= vma->vm_start)) {
-                       /*
-                        * Remember the place where we stopped the search:
-                        */
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               addr = vma->vm_end;
-               if (do_colour_align)
-                       addr = COLOUR_ALIGN(addr, pgoff);
-       }
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = TASK_UNMAPPED_BASE;
+       info.high_limit = TASK_SIZE;
+       info.align_mask = do_colour_align ? (PAGE_MASK & shm_align_mask) : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
+       return vm_unmapped_area(&info);
 }
 
 unsigned long
@@ -131,6 +85,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        struct mm_struct *mm = current->mm;
        unsigned long addr = addr0;
        int do_colour_align;
+       struct vm_unmapped_area_info info;
 
        if (flags & MAP_FIXED) {
                /* We do not accept a shared mapping if it would violate
@@ -162,73 +117,27 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                        return addr;
        }
 
-       /* check if free_area_cache is useful for us */
-       if (len <= mm->cached_hole_size) {
-               mm->cached_hole_size = 0;
-               mm->free_area_cache = mm->mmap_base;
-       }
-
-       /* either no address requested or can't fit in requested address hole */
-       addr = mm->free_area_cache;
-       if (do_colour_align) {
-               unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff);
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = PAGE_SIZE;
+       info.high_limit = mm->mmap_base;
+       info.align_mask = do_colour_align ? (PAGE_MASK & shm_align_mask) : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
+       addr = vm_unmapped_area(&info);
 
-               addr = base + len;
-       }
-
-       /* make sure it can fit in the remaining address space */
-       if (likely(addr > len)) {
-               vma = find_vma(mm, addr-len);
-               if (!vma || addr <= vma->vm_start) {
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr-len);
-               }
-       }
-
-       if (unlikely(mm->mmap_base < len))
-               goto bottomup;
-
-       addr = mm->mmap_base-len;
-       if (do_colour_align)
-               addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-
-       do {
-               /*
-                * Lookup failure means no vma is above this address,
-                * else if new region fits below vma->vm_start,
-                * return with success:
-                */
-               vma = find_vma(mm, addr);
-               if (likely(!vma || addr+len <= vma->vm_start)) {
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr);
-               }
-
-               /* remember the largest hole we saw so far */
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               /* try just below the current vma->vm_start */
-               addr = vma->vm_start-len;
-               if (do_colour_align)
-                       addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-       } while (likely(len < vma->vm_start));
-
-bottomup:
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
-       mm->cached_hole_size = ~0UL;
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = mm->mmap_base;
-       mm->cached_hole_size = ~0UL;
+       if (addr & ~PAGE_MASK) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.flags = 0;
+               info.low_limit = TASK_UNMAPPED_BASE;
+               info.high_limit = TASK_SIZE;
+               addr = vm_unmapped_area(&info);
+       }
 
        return addr;
 }
index 0c9b31b..57277c8 100644 (file)
@@ -34,11 +34,9 @@ asmlinkage unsigned long sys_getpagesize(void)
        return PAGE_SIZE; /* Possibly older binaries want 8192 on sun4's? */
 }
 
-#define COLOUR_ALIGN(addr)      (((addr)+SHMLBA-1)&~(SHMLBA-1))
-
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
-       struct vm_area_struct * vmm;
+       struct vm_unmapped_area_info info;
 
        if (flags & MAP_FIXED) {
                /* We do not accept a shared mapping if it would violate
@@ -56,21 +54,14 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
        if (!addr)
                addr = TASK_UNMAPPED_BASE;
 
-       if (flags & MAP_SHARED)
-               addr = COLOUR_ALIGN(addr);
-       else
-               addr = PAGE_ALIGN(addr);
-
-       for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
-               /* At this point:  (!vmm || addr < vmm->vm_end). */
-               if (TASK_SIZE - PAGE_SIZE - len < addr)
-                       return -ENOMEM;
-               if (!vmm || addr + len <= vmm->vm_start)
-                       return addr;
-               addr = vmm->vm_end;
-               if (flags & MAP_SHARED)
-                       addr = COLOUR_ALIGN(addr);
-       }
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = addr;
+       info.high_limit = TASK_SIZE;
+       info.align_mask = (flags & MAP_SHARED) ?
+               (PAGE_MASK & (SHMLBA - 1)) : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
+       return vm_unmapped_area(&info);
 }
 
 /*
index 878ef3d..97309c0 100644 (file)
@@ -75,7 +75,7 @@ static inline int invalid_64bit_range(unsigned long addr, unsigned long len)
  *    the spitfire/niagara VA-hole.
  */
 
-static inline unsigned long COLOUR_ALIGN(unsigned long addr,
+static inline unsigned long COLOR_ALIGN(unsigned long addr,
                                         unsigned long pgoff)
 {
        unsigned long base = (addr+SHMLBA-1)&~(SHMLBA-1);
@@ -84,24 +84,13 @@ static inline unsigned long COLOUR_ALIGN(unsigned long addr,
        return base + off;
 }
 
-static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
-                                             unsigned long pgoff)
-{
-       unsigned long base = addr & ~(SHMLBA-1);
-       unsigned long off = (pgoff<<PAGE_SHIFT) & (SHMLBA-1);
-
-       if (base + off <= addr)
-               return base + off;
-       return base - off;
-}
-
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct * vma;
        unsigned long task_size = TASK_SIZE;
-       unsigned long start_addr;
        int do_color_align;
+       struct vm_unmapped_area_info info;
 
        if (flags & MAP_FIXED) {
                /* We do not accept a shared mapping if it would violate
@@ -124,7 +113,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 
        if (addr) {
                if (do_color_align)
-                       addr = COLOUR_ALIGN(addr, pgoff);
+                       addr = COLOR_ALIGN(addr, pgoff);
                else
                        addr = PAGE_ALIGN(addr);
 
@@ -134,50 +123,22 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
                        return addr;
        }
 
-       if (len > mm->cached_hole_size) {
-               start_addr = addr = mm->free_area_cache;
-       } else {
-               start_addr = addr = TASK_UNMAPPED_BASE;
-               mm->cached_hole_size = 0;
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = TASK_UNMAPPED_BASE;
+       info.high_limit = min(task_size, VA_EXCLUDE_START);
+       info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
+       addr = vm_unmapped_area(&info);
+
+       if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.low_limit = VA_EXCLUDE_END;
+               info.high_limit = task_size;
+               addr = vm_unmapped_area(&info);
        }
 
-       task_size -= len;
-
-full_search:
-       if (do_color_align)
-               addr = COLOUR_ALIGN(addr, pgoff);
-       else
-               addr = PAGE_ALIGN(addr);
-
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (addr < VA_EXCLUDE_START &&
-                   (addr + len) >= VA_EXCLUDE_START) {
-                       addr = VA_EXCLUDE_END;
-                       vma = find_vma(mm, VA_EXCLUDE_END);
-               }
-               if (unlikely(task_size < addr)) {
-                       if (start_addr != TASK_UNMAPPED_BASE) {
-                               start_addr = addr = TASK_UNMAPPED_BASE;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (likely(!vma || addr + len <= vma->vm_start)) {
-                       /*
-                        * Remember the place where we stopped the search:
-                        */
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               addr = vma->vm_end;
-               if (do_color_align)
-                       addr = COLOUR_ALIGN(addr, pgoff);
-       }
+       return addr;
 }
 
 unsigned long
@@ -190,6 +151,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        unsigned long task_size = STACK_TOP32;
        unsigned long addr = addr0;
        int do_color_align;
+       struct vm_unmapped_area_info info;
 
        /* This should only ever run for 32-bit processes.  */
        BUG_ON(!test_thread_flag(TIF_32BIT));
@@ -214,7 +176,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        /* requesting a specific address */
        if (addr) {
                if (do_color_align)
-                       addr = COLOUR_ALIGN(addr, pgoff);
+                       addr = COLOR_ALIGN(addr, pgoff);
                else
                        addr = PAGE_ALIGN(addr);
 
@@ -224,73 +186,27 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                        return addr;
        }
 
-       /* check if free_area_cache is useful for us */
-       if (len <= mm->cached_hole_size) {
-               mm->cached_hole_size = 0;
-               mm->free_area_cache = mm->mmap_base;
-       }
-
-       /* either no address requested or can't fit in requested address hole */
-       addr = mm->free_area_cache;
-       if (do_color_align) {
-               unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff);
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = PAGE_SIZE;
+       info.high_limit = mm->mmap_base;
+       info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
+       addr = vm_unmapped_area(&info);
 
-               addr = base + len;
-       }
-
-       /* make sure it can fit in the remaining address space */
-       if (likely(addr > len)) {
-               vma = find_vma(mm, addr-len);
-               if (!vma || addr <= vma->vm_start) {
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr-len);
-               }
-       }
-
-       if (unlikely(mm->mmap_base < len))
-               goto bottomup;
-
-       addr = mm->mmap_base-len;
-       if (do_color_align)
-               addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-
-       do {
-               /*
-                * Lookup failure means no vma is above this address,
-                * else if new region fits below vma->vm_start,
-                * return with success:
-                */
-               vma = find_vma(mm, addr);
-               if (likely(!vma || addr+len <= vma->vm_start)) {
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr);
-               }
-
-               /* remember the largest hole we saw so far */
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               /* try just below the current vma->vm_start */
-               addr = vma->vm_start-len;
-               if (do_color_align)
-                       addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-       } while (likely(len < vma->vm_start));
-
-bottomup:
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
-       mm->cached_hole_size = ~0UL;
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = mm->mmap_base;
-       mm->cached_hole_size = ~0UL;
+       if (addr & ~PAGE_MASK) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.flags = 0;
+               info.low_limit = TASK_UNMAPPED_BASE;
+               info.high_limit = STACK_TOP32;
+               addr = vm_unmapped_area(&info);
+       }
 
        return addr;
 }
index f76f83d..d2b5944 100644 (file)
@@ -30,55 +30,28 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
                                                        unsigned long pgoff,
                                                        unsigned long flags)
 {
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct * vma;
        unsigned long task_size = TASK_SIZE;
-       unsigned long start_addr;
+       struct vm_unmapped_area_info info;
 
        if (test_thread_flag(TIF_32BIT))
                task_size = STACK_TOP32;
-       if (unlikely(len >= VA_EXCLUDE_START))
-               return -ENOMEM;
 
-       if (len > mm->cached_hole_size) {
-               start_addr = addr = mm->free_area_cache;
-       } else {
-               start_addr = addr = TASK_UNMAPPED_BASE;
-               mm->cached_hole_size = 0;
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = TASK_UNMAPPED_BASE;
+       info.high_limit = min(task_size, VA_EXCLUDE_START);
+       info.align_mask = PAGE_MASK & ~HPAGE_MASK;
+       info.align_offset = 0;
+       addr = vm_unmapped_area(&info);
+
+       if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.low_limit = VA_EXCLUDE_END;
+               info.high_limit = task_size;
+               addr = vm_unmapped_area(&info);
        }
 
-       task_size -= len;
-
-full_search:
-       addr = ALIGN(addr, HPAGE_SIZE);
-
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (addr < VA_EXCLUDE_START &&
-                   (addr + len) >= VA_EXCLUDE_START) {
-                       addr = VA_EXCLUDE_END;
-                       vma = find_vma(mm, VA_EXCLUDE_END);
-               }
-               if (unlikely(task_size < addr)) {
-                       if (start_addr != TASK_UNMAPPED_BASE) {
-                               start_addr = addr = TASK_UNMAPPED_BASE;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (likely(!vma || addr + len <= vma->vm_start)) {
-                       /*
-                        * Remember the place where we stopped the search:
-                        */
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               addr = ALIGN(vma->vm_end, HPAGE_SIZE);
-       }
+       return addr;
 }
 
 static unsigned long
@@ -87,71 +60,34 @@ hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                                  const unsigned long pgoff,
                                  const unsigned long flags)
 {
-       struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        unsigned long addr = addr0;
+       struct vm_unmapped_area_info info;
 
        /* This should only ever run for 32-bit processes.  */
        BUG_ON(!test_thread_flag(TIF_32BIT));
 
-       /* check if free_area_cache is useful for us */
-       if (len <= mm->cached_hole_size) {
-               mm->cached_hole_size = 0;
-               mm->free_area_cache = mm->mmap_base;
-       }
-
-       /* either no address requested or can't fit in requested address hole */
-       addr = mm->free_area_cache & HPAGE_MASK;
-
-       /* make sure it can fit in the remaining address space */
-       if (likely(addr > len)) {
-               vma = find_vma(mm, addr-len);
-               if (!vma || addr <= vma->vm_start) {
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr-len);
-               }
-       }
-
-       if (unlikely(mm->mmap_base < len))
-               goto bottomup;
-
-       addr = (mm->mmap_base-len) & HPAGE_MASK;
-
-       do {
-               /*
-                * Lookup failure means no vma is above this address,
-                * else if new region fits below vma->vm_start,
-                * return with success:
-                */
-               vma = find_vma(mm, addr);
-               if (likely(!vma || addr+len <= vma->vm_start)) {
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr);
-               }
-
-               /* remember the largest hole we saw so far */
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               /* try just below the current vma->vm_start */
-               addr = (vma->vm_start-len) & HPAGE_MASK;
-       } while (likely(len < vma->vm_start));
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = PAGE_SIZE;
+       info.high_limit = mm->mmap_base;
+       info.align_mask = PAGE_MASK & ~HPAGE_MASK;
+       info.align_offset = 0;
+       addr = vm_unmapped_area(&info);
 
-bottomup:
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
-       mm->cached_hole_size = ~0UL;
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = mm->mmap_base;
-       mm->cached_hole_size = ~0UL;
+       if (addr & ~PAGE_MASK) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.flags = 0;
+               info.low_limit = TASK_UNMAPPED_BASE;
+               info.high_limit = STACK_TOP32;
+               addr = vm_unmapped_area(&info);
+       }
 
        return addr;
 }
index 812e2d0..650ccff 100644 (file)
@@ -231,42 +231,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
                unsigned long pgoff, unsigned long flags)
 {
        struct hstate *h = hstate_file(file);
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       unsigned long start_addr;
-
-       if (len > mm->cached_hole_size) {
-               start_addr = mm->free_area_cache;
-       } else {
-               start_addr = TASK_UNMAPPED_BASE;
-               mm->cached_hole_size = 0;
-       }
-
-full_search:
-       addr = ALIGN(start_addr, huge_page_size(h));
-
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (TASK_SIZE - len < addr) {
-                       /*
-                        * Start a new search - just in case we missed
-                        * some holes.
-                        */
-                       if (start_addr != TASK_UNMAPPED_BASE) {
-                               start_addr = TASK_UNMAPPED_BASE;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (!vma || addr + len <= vma->vm_start) {
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-               addr = ALIGN(vma->vm_end, huge_page_size(h));
-       }
+       struct vm_unmapped_area_info info;
+
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = TASK_UNMAPPED_BASE;
+       info.high_limit = TASK_SIZE;
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+       info.align_offset = 0;
+       return vm_unmapped_area(&info);
 }
 
 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
@@ -274,92 +247,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                unsigned long pgoff, unsigned long flags)
 {
        struct hstate *h = hstate_file(file);
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma, *prev_vma;
-       unsigned long base = mm->mmap_base, addr = addr0;
-       unsigned long largest_hole = mm->cached_hole_size;
-       int first_time = 1;
-
-       /* don't allow allocations above current base */
-       if (mm->free_area_cache > base)
-               mm->free_area_cache = base;
-
-       if (len <= largest_hole) {
-               largest_hole = 0;
-               mm->free_area_cache  = base;
-       }
-try_again:
-       /* make sure it can fit in the remaining address space */
-       if (mm->free_area_cache < len)
-               goto fail;
-
-       /* either no address requested or can't fit in requested address hole */
-       addr = (mm->free_area_cache - len) & huge_page_mask(h);
-       do {
-               /*
-                * Lookup failure means no vma is above this address,
-                * i.e. return with success:
-                */
-               vma = find_vma_prev(mm, addr, &prev_vma);
-               if (!vma) {
-                       return addr;
-                       break;
-               }
-
-               /*
-                * new region fits between prev_vma->vm_end and
-                * vma->vm_start, use it:
-                */
-               if (addr + len <= vma->vm_start &&
-                           (!prev_vma || (addr >= prev_vma->vm_end))) {
-                       /* remember the address as a hint for next time */
-                       mm->cached_hole_size = largest_hole;
-                       mm->free_area_cache = addr;
-                       return addr;
-               } else {
-                       /* pull free_area_cache down to the first hole */
-                       if (mm->free_area_cache == vma->vm_end) {
-                               mm->free_area_cache = vma->vm_start;
-                               mm->cached_hole_size = largest_hole;
-                       }
-               }
+       struct vm_unmapped_area_info info;
+       unsigned long addr;
 
-               /* remember the largest hole we saw so far */
-               if (addr + largest_hole < vma->vm_start)
-                       largest_hole = vma->vm_start - addr;
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = PAGE_SIZE;
+       info.high_limit = current->mm->mmap_base;
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+       info.align_offset = 0;
+       addr = vm_unmapped_area(&info);
 
-               /* try just below the current vma->vm_start */
-               addr = (vma->vm_start - len) & huge_page_mask(h);
-
-       } while (len <= vma->vm_start);
-
-fail:
-       /*
-        * if hint left us with no space for the requested
-        * mapping then try again:
-        */
-       if (first_time) {
-               mm->free_area_cache = base;
-               largest_hole = 0;
-               first_time = 0;
-               goto try_again;
-       }
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       mm->cached_hole_size = ~0UL;
-       addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
-                       len, pgoff, flags);
-
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = base;
-       mm->cached_hole_size = ~0UL;
+       if (addr & ~PAGE_MASK) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.flags = 0;
+               info.low_limit = TASK_UNMAPPED_BASE;
+               info.high_limit = TASK_SIZE;
+               addr = vm_unmapped_area(&info);
+       }
 
        return addr;
 }
index 5939f44..9c999c1 100644 (file)
@@ -354,12 +354,10 @@ static inline int mmap_is_ia32(void)
        return 0;
 }
 
-/* The first two values are special, do not change. See align_addr() */
+/* Do not change the values. See get_align_mask() */
 enum align_flags {
        ALIGN_VA_32     = BIT(0),
        ALIGN_VA_64     = BIT(1),
-       ALIGN_VDSO      = BIT(2),
-       ALIGN_TOPDOWN   = BIT(3),
 };
 
 struct va_alignment {
@@ -368,5 +366,5 @@ struct va_alignment {
 } ____cacheline_aligned;
 
 extern struct va_alignment va_align;
-extern unsigned long align_addr(unsigned long, struct file *, enum align_flags);
+extern unsigned long align_vdso_addr(unsigned long);
 #endif /* _ASM_X86_ELF_H */
index 593e51d..513b05f 100644 (file)
@@ -3,6 +3,9 @@
 
 #define MAP_32BIT      0x40            /* only give out 32bit addresses */
 
+#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
+
 #include <asm-generic/mman.h>
 
 #endif /* _ASM_X86_MMAN_H */
index b4d3c39..97ef74b 100644 (file)
 
 /*
  * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
- *
- * @flags denotes the allocation direction - bottomup or topdown -
- * or vDSO; see call sites below.
  */
-unsigned long align_addr(unsigned long addr, struct file *filp,
-                        enum align_flags flags)
+static unsigned long get_align_mask(void)
 {
-       unsigned long tmp_addr;
-
        /* handle 32- and 64-bit case with a single conditional */
        if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
-               return addr;
+               return 0;
 
        if (!(current->flags & PF_RANDOMIZE))
-               return addr;
-
-       if (!((flags & ALIGN_VDSO) || filp))
-               return addr;
-
-       tmp_addr = addr;
-
-       /*
-        * We need an address which is <= than the original
-        * one only when in topdown direction.
-        */
-       if (!(flags & ALIGN_TOPDOWN))
-               tmp_addr += va_align.mask;
+               return 0;
 
-       tmp_addr &= ~va_align.mask;
+       return va_align.mask;
+}
 
-       return tmp_addr;
+unsigned long align_vdso_addr(unsigned long addr)
+{
+       unsigned long align_mask = get_align_mask();
+       return (addr + align_mask) & ~align_mask;
 }
 
 static int __init control_va_addr_alignment(char *str)
@@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
-       unsigned long start_addr;
+       struct vm_unmapped_area_info info;
        unsigned long begin, end;
 
        if (flags & MAP_FIXED)
@@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
                    (!vma || addr + len <= vma->vm_start))
                        return addr;
        }
-       if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32))
-           && len <= mm->cached_hole_size) {
-               mm->cached_hole_size = 0;
-               mm->free_area_cache = begin;
-       }
-       addr = mm->free_area_cache;
-       if (addr < begin)
-               addr = begin;
-       start_addr = addr;
-
-full_search:
-
-       addr = align_addr(addr, filp, 0);
-
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (end - len < addr) {
-                       /*
-                        * Start a new search - just in case we missed
-                        * some holes.
-                        */
-                       if (start_addr != begin) {
-                               start_addr = addr = begin;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (!vma || addr + len <= vma->vm_start) {
-                       /*
-                        * Remember the place where we stopped the search:
-                        */
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
 
-               addr = vma->vm_end;
-               addr = align_addr(addr, filp, 0);
-       }
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = begin;
+       info.high_limit = end;
+       info.align_mask = filp ? get_align_mask() : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
+       return vm_unmapped_area(&info);
 }
 
-
 unsigned long
 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                          const unsigned long len, const unsigned long pgoff,
@@ -195,7 +147,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 {
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
-       unsigned long addr = addr0, start_addr;
+       unsigned long addr = addr0;
+       struct vm_unmapped_area_info info;
 
        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
@@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                        return addr;
        }
 
-       /* check if free_area_cache is useful for us */
-       if (len <= mm->cached_hole_size) {
-               mm->cached_hole_size = 0;
-               mm->free_area_cache = mm->mmap_base;
-       }
-
-try_again:
-       /* either no address requested or can't fit in requested address hole */
-       start_addr = addr = mm->free_area_cache;
-
-       if (addr < len)
-               goto fail;
-
-       addr -= len;
-       do {
-               addr = align_addr(addr, filp, ALIGN_TOPDOWN);
-
-               /*
-                * Lookup failure means no vma is above this address,
-                * else if new region fits below vma->vm_start,
-                * return with success:
-                */
-               vma = find_vma(mm, addr);
-               if (!vma || addr+len <= vma->vm_start)
-                       /* remember the address as a hint for next time */
-                       return mm->free_area_cache = addr;
-
-               /* remember the largest hole we saw so far */
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               /* try just below the current vma->vm_start */
-               addr = vma->vm_start-len;
-       } while (len < vma->vm_start);
-
-fail:
-       /*
-        * if hint left us with no space for the requested
-        * mapping then try again:
-        */
-       if (start_addr != mm->mmap_base) {
-               mm->free_area_cache = mm->mmap_base;
-               mm->cached_hole_size = 0;
-               goto try_again;
-       }
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = PAGE_SIZE;
+       info.high_limit = mm->mmap_base;
+       info.align_mask = filp ? get_align_mask() : 0;
+       info.align_offset = pgoff << PAGE_SHIFT;
+       addr = vm_unmapped_area(&info);
+       if (!(addr & ~PAGE_MASK))
+               return addr;
+       VM_BUG_ON(addr != -ENOMEM);
 
 bottomup:
        /*
@@ -270,14 +188,5 @@ bottomup:
         * can happen with large stack limits and large mmap()
         * allocations.
         */
-       mm->cached_hole_size = ~0UL;
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = mm->mmap_base;
-       mm->cached_hole_size = ~0UL;
-
-       return addr;
+       return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 }
index 937bff5..ae1aa71 100644 (file)
@@ -274,42 +274,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
                unsigned long pgoff, unsigned long flags)
 {
        struct hstate *h = hstate_file(file);
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       unsigned long start_addr;
-
-       if (len > mm->cached_hole_size) {
-               start_addr = mm->free_area_cache;
-       } else {
-               start_addr = TASK_UNMAPPED_BASE;
-               mm->cached_hole_size = 0;
-       }
-
-full_search:
-       addr = ALIGN(start_addr, huge_page_size(h));
-
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (TASK_SIZE - len < addr) {
-                       /*
-                        * Start a new search - just in case we missed
-                        * some holes.
-                        */
-                       if (start_addr != TASK_UNMAPPED_BASE) {
-                               start_addr = TASK_UNMAPPED_BASE;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (!vma || addr + len <= vma->vm_start) {
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-               addr = ALIGN(vma->vm_end, huge_page_size(h));
-       }
+       struct vm_unmapped_area_info info;
+
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = TASK_UNMAPPED_BASE;
+       info.high_limit = TASK_SIZE;
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+       info.align_offset = 0;
+       return vm_unmapped_area(&info);
 }
 
 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
@@ -317,83 +290,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                unsigned long pgoff, unsigned long flags)
 {
        struct hstate *h = hstate_file(file);
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       unsigned long base = mm->mmap_base;
-       unsigned long addr = addr0;
-       unsigned long largest_hole = mm->cached_hole_size;
-       unsigned long start_addr;
-
-       /* don't allow allocations above current base */
-       if (mm->free_area_cache > base)
-               mm->free_area_cache = base;
-
-       if (len <= largest_hole) {
-               largest_hole = 0;
-               mm->free_area_cache  = base;
-       }
-try_again:
-       start_addr = mm->free_area_cache;
-
-       /* make sure it can fit in the remaining address space */
-       if (mm->free_area_cache < len)
-               goto fail;
-
-       /* either no address requested or can't fit in requested address hole */
-       addr = (mm->free_area_cache - len) & huge_page_mask(h);
-       do {
-               /*
-                * Lookup failure means no vma is above this address,
-                * i.e. return with success:
-                */
-               vma = find_vma(mm, addr);
-               if (!vma)
-                       return addr;
+       struct vm_unmapped_area_info info;
+       unsigned long addr;
 
-               if (addr + len <= vma->vm_start) {
-                       /* remember the address as a hint for next time */
-                       mm->cached_hole_size = largest_hole;
-                       return (mm->free_area_cache = addr);
-               } else if (mm->free_area_cache == vma->vm_end) {
-                       /* pull free_area_cache down to the first hole */
-                       mm->free_area_cache = vma->vm_start;
-                       mm->cached_hole_size = largest_hole;
-               }
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = PAGE_SIZE;
+       info.high_limit = current->mm->mmap_base;
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+       info.align_offset = 0;
+       addr = vm_unmapped_area(&info);
 
-               /* remember the largest hole we saw so far */
-               if (addr + largest_hole < vma->vm_start)
-                       largest_hole = vma->vm_start - addr;
-
-               /* try just below the current vma->vm_start */
-               addr = (vma->vm_start - len) & huge_page_mask(h);
-       } while (len <= vma->vm_start);
-
-fail:
-       /*
-        * if hint left us with no space for the requested
-        * mapping then try again:
-        */
-       if (start_addr != base) {
-               mm->free_area_cache = base;
-               largest_hole = 0;
-               goto try_again;
-       }
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       mm->cached_hole_size = ~0UL;
-       addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
-                       len, pgoff, flags);
-
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = base;
-       mm->cached_hole_size = ~0UL;
+       if (addr & ~PAGE_MASK) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.flags = 0;
+               info.low_limit = TASK_UNMAPPED_BASE;
+               info.high_limit = TASK_SIZE;
+               addr = vm_unmapped_area(&info);
+       }
 
        return addr;
 }
index 00aaf04..431e875 100644 (file)
@@ -141,7 +141,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
         * unaligned here as a result of stack start randomization.
         */
        addr = PAGE_ALIGN(addr);
-       addr = align_addr(addr, NULL, ALIGN_VDSO);
+       addr = align_vdso_addr(addr);
 
        return addr;
 }
index 25bc6c1..00eed67 100644 (file)
 /* compatibility flags */
 #define MAP_FILE       0
 
+/*
+ * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all powers of two.
+ * When 0 use the default page size.
+ */
+#define MAP_HUGE_SHIFT 26
+#define MAP_HUGE_MASK  0x3f
+
 #endif /* _XTENSA_MMAN_H */
index 86c8821..987604d 100644 (file)
@@ -70,6 +70,13 @@ void unregister_memory_isolate_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_memory_isolate_notifier);
 
+static void memory_block_release(struct device *dev)
+{
+       struct memory_block *mem = container_of(dev, struct memory_block, dev);
+
+       kfree(mem);
+}
+
 /*
  * register_memory - Setup a sysfs device for a memory block
  */
@@ -80,6 +87,7 @@ int register_memory(struct memory_block *memory)
 
        memory->dev.bus = &memory_subsys;
        memory->dev.id = memory->start_section_nr / sections_per_block;
+       memory->dev.release = memory_block_release;
 
        error = device_register(&memory->dev);
        return error;
@@ -246,7 +254,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn,
  * OK to have direct references to sparsemem variables in here.
  */
 static int
-memory_block_action(unsigned long phys_index, unsigned long action)
+memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
 {
        unsigned long start_pfn;
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
@@ -261,7 +269,7 @@ memory_block_action(unsigned long phys_index, unsigned long action)
                        if (!pages_correctly_reserved(start_pfn, nr_pages))
                                return -EBUSY;
 
-                       ret = online_pages(start_pfn, nr_pages);
+                       ret = online_pages(start_pfn, nr_pages, online_type);
                        break;
                case MEM_OFFLINE:
                        ret = offline_pages(start_pfn, nr_pages);
@@ -276,7 +284,8 @@ memory_block_action(unsigned long phys_index, unsigned long action)
 }
 
 static int __memory_block_change_state(struct memory_block *mem,
-               unsigned long to_state, unsigned long from_state_req)
+               unsigned long to_state, unsigned long from_state_req,
+               int online_type)
 {
        int ret = 0;
 
@@ -288,7 +297,7 @@ static int __memory_block_change_state(struct memory_block *mem,
        if (to_state == MEM_OFFLINE)
                mem->state = MEM_GOING_OFFLINE;
 
-       ret = memory_block_action(mem->start_section_nr, to_state);
+       ret = memory_block_action(mem->start_section_nr, to_state, online_type);
 
        if (ret) {
                mem->state = from_state_req;
@@ -311,12 +320,14 @@ out:
 }
 
 static int memory_block_change_state(struct memory_block *mem,
-               unsigned long to_state, unsigned long from_state_req)
+               unsigned long to_state, unsigned long from_state_req,
+               int online_type)
 {
        int ret;
 
        mutex_lock(&mem->state_mutex);
-       ret = __memory_block_change_state(mem, to_state, from_state_req);
+       ret = __memory_block_change_state(mem, to_state, from_state_req,
+                                         online_type);
        mutex_unlock(&mem->state_mutex);
 
        return ret;
@@ -330,10 +341,18 @@ store_mem_state(struct device *dev,
 
        mem = container_of(dev, struct memory_block, dev);
 
-       if (!strncmp(buf, "online", min((int)count, 6)))
-               ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
-       else if(!strncmp(buf, "offline", min((int)count, 7)))
-               ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+       if (!strncmp(buf, "online_kernel", min_t(int, count, 13)))
+               ret = memory_block_change_state(mem, MEM_ONLINE,
+                                               MEM_OFFLINE, ONLINE_KERNEL);
+       else if (!strncmp(buf, "online_movable", min_t(int, count, 14)))
+               ret = memory_block_change_state(mem, MEM_ONLINE,
+                                               MEM_OFFLINE, ONLINE_MOVABLE);
+       else if (!strncmp(buf, "online", min_t(int, count, 6)))
+               ret = memory_block_change_state(mem, MEM_ONLINE,
+                                               MEM_OFFLINE, ONLINE_KEEP);
+       else if(!strncmp(buf, "offline", min_t(int, count, 7)))
+               ret = memory_block_change_state(mem, MEM_OFFLINE,
+                                               MEM_ONLINE, -1);
 
        if (ret)
                return ret;
@@ -635,7 +654,6 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section,
                mem_remove_simple_file(mem, phys_device);
                mem_remove_simple_file(mem, removable);
                unregister_memory(mem);
-               kfree(mem);
        } else
                kobject_put(&mem->dev.kobj);
 
@@ -669,7 +687,7 @@ int offline_memory_block(struct memory_block *mem)
 
        mutex_lock(&mem->state_mutex);
        if (mem->state != MEM_OFFLINE)
-               ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+               ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1);
        mutex_unlock(&mem->state_mutex);
 
        return ret;
index af1a177..294e316 100644 (file)
@@ -252,6 +252,24 @@ static inline void hugetlb_register_node(struct node *node) {}
 static inline void hugetlb_unregister_node(struct node *node) {}
 #endif
 
+static void node_device_release(struct device *dev)
+{
+       struct node *node = to_node(dev);
+
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+       /*
+        * We schedule the work only when a memory section is
+        * onlined/offlined on this node. When we come here,
+        * all the memory on this node has been offlined,
+        * so we won't enqueue new work to this work.
+        *
+        * The work is using node->node_work, so we should
+        * flush work before freeing the memory.
+        */
+       flush_work(&node->node_work);
+#endif
+       kfree(node);
+}
 
 /*
  * register_node - Setup a sysfs device for a node.
@@ -259,12 +277,13 @@ static inline void hugetlb_unregister_node(struct node *node) {}
  *
  * Initialize and register the node device.
  */
-int register_node(struct node *node, int num, struct node *parent)
+static int register_node(struct node *node, int num, struct node *parent)
 {
        int error;
 
        node->dev.id = num;
        node->dev.bus = &node_subsys;
+       node->dev.release = node_device_release;
        error = device_register(&node->dev);
 
        if (!error){
@@ -306,7 +325,7 @@ void unregister_node(struct node *node)
        device_unregister(&node->dev);
 }
 
-struct node node_devices[MAX_NUMNODES];
+struct node *node_devices[MAX_NUMNODES];
 
 /*
  * register cpu under node
@@ -323,15 +342,15 @@ int register_cpu_under_node(unsigned int cpu, unsigned int nid)
        if (!obj)
                return 0;
 
-       ret = sysfs_create_link(&node_devices[nid].dev.kobj,
+       ret = sysfs_create_link(&node_devices[nid]->dev.kobj,
                                &obj->kobj,
                                kobject_name(&obj->kobj));
        if (ret)
                return ret;
 
        return sysfs_create_link(&obj->kobj,
-                                &node_devices[nid].dev.kobj,
-                                kobject_name(&node_devices[nid].dev.kobj));
+                                &node_devices[nid]->dev.kobj,
+                                kobject_name(&node_devices[nid]->dev.kobj));
 }
 
 int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
@@ -345,10 +364,10 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
        if (!obj)
                return 0;
 
-       sysfs_remove_link(&node_devices[nid].dev.kobj,
+       sysfs_remove_link(&node_devices[nid]->dev.kobj,
                          kobject_name(&obj->kobj));
        sysfs_remove_link(&obj->kobj,
-                         kobject_name(&node_devices[nid].dev.kobj));
+                         kobject_name(&node_devices[nid]->dev.kobj));
 
        return 0;
 }
@@ -390,15 +409,15 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
                        continue;
                if (page_nid != nid)
                        continue;
-               ret = sysfs_create_link_nowarn(&node_devices[nid].dev.kobj,
+               ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
                                        &mem_blk->dev.kobj,
                                        kobject_name(&mem_blk->dev.kobj));
                if (ret)
                        return ret;
 
                return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
-                               &node_devices[nid].dev.kobj,
-                               kobject_name(&node_devices[nid].dev.kobj));
+                               &node_devices[nid]->dev.kobj,
+                               kobject_name(&node_devices[nid]->dev.kobj));
        }
        /* mem section does not span the specified node */
        return 0;
@@ -431,10 +450,10 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
                        continue;
                if (node_test_and_set(nid, *unlinked_nodes))
                        continue;
-               sysfs_remove_link(&node_devices[nid].dev.kobj,
+               sysfs_remove_link(&node_devices[nid]->dev.kobj,
                         kobject_name(&mem_blk->dev.kobj));
                sysfs_remove_link(&mem_blk->dev.kobj,
-                        kobject_name(&node_devices[nid].dev.kobj));
+                        kobject_name(&node_devices[nid]->dev.kobj));
        }
        NODEMASK_FREE(unlinked_nodes);
        return 0;
@@ -500,7 +519,7 @@ static void node_hugetlb_work(struct work_struct *work)
 
 static void init_node_hugetlb_work(int nid)
 {
-       INIT_WORK(&node_devices[nid].node_work, node_hugetlb_work);
+       INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work);
 }
 
 static int node_memory_callback(struct notifier_block *self,
@@ -517,7 +536,7 @@ static int node_memory_callback(struct notifier_block *self,
                 * when transitioning to/from memoryless state.
                 */
                if (nid != NUMA_NO_NODE)
-                       schedule_work(&node_devices[nid].node_work);
+                       schedule_work(&node_devices[nid]->node_work);
                break;
 
        case MEM_GOING_ONLINE:
@@ -558,9 +577,13 @@ int register_one_node(int nid)
                struct node *parent = NULL;
 
                if (p_node != nid)
-                       parent = &node_devices[p_node];
+                       parent = node_devices[p_node];
+
+               node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL);
+               if (!node_devices[nid])
+                       return -ENOMEM;
 
-               error = register_node(&node_devices[nid], nid, parent);
+               error = register_node(node_devices[nid], nid, parent);
 
                /* link cpu under this node */
                for_each_present_cpu(cpu) {
@@ -581,7 +604,8 @@ int register_one_node(int nid)
 
 void unregister_one_node(int nid)
 {
-       unregister_node(&node_devices[nid]);
+       unregister_node(node_devices[nid]);
+       node_devices[nid] = NULL;
 }
 
 /*
@@ -614,23 +638,23 @@ static ssize_t show_node_state(struct device *dev,
        { __ATTR(name, 0444, show_node_state, NULL), state }
 
 static struct node_attr node_state_attr[] = {
-       _NODE_ATTR(possible, N_POSSIBLE),
-       _NODE_ATTR(online, N_ONLINE),
-       _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
-       _NODE_ATTR(has_cpu, N_CPU),
+       [N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE),
+       [N_ONLINE] = _NODE_ATTR(online, N_ONLINE),
+       [N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
 #ifdef CONFIG_HIGHMEM
-       _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
+       [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
 #endif
+       [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
 };
 
 static struct attribute *node_state_attrs[] = {
-       &node_state_attr[0].attr.attr,
-       &node_state_attr[1].attr.attr,
-       &node_state_attr[2].attr.attr,
-       &node_state_attr[3].attr.attr,
+       &node_state_attr[N_POSSIBLE].attr.attr,
+       &node_state_attr[N_ONLINE].attr.attr,
+       &node_state_attr[N_NORMAL_MEMORY].attr.attr,
 #ifdef CONFIG_HIGHMEM
-       &node_state_attr[4].attr.attr,
+       &node_state_attr[N_HIGH_MEMORY].attr.attr,
 #endif
+       &node_state_attr[N_CPU].attr.attr,
        NULL
 };
 
index 7d5a6b4..1963680 100644 (file)
@@ -565,7 +565,7 @@ fail_msg_node:
 fail_db_node:
        of_node_put(smu->db_node);
 fail_bootmem:
-       free_bootmem((unsigned long)smu, sizeof(struct smu_device));
+       free_bootmem(__pa(smu), sizeof(struct smu_device));
        smu = NULL;
 fail_np:
        of_node_put(np);
index b91e4bc..3b91b0f 100644 (file)
@@ -40,7 +40,7 @@
 #include <linux/notifier.h>
 
 static uint32_t lowmem_debug_level = 2;
-static int lowmem_adj[6] = {
+static short lowmem_adj[6] = {
        0,
        1,
        6,
@@ -70,9 +70,9 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
        int rem = 0;
        int tasksize;
        int i;
-       int min_score_adj = OOM_SCORE_ADJ_MAX + 1;
+       short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
        int selected_tasksize = 0;
-       int selected_oom_score_adj;
+       short selected_oom_score_adj;
        int array_size = ARRAY_SIZE(lowmem_adj);
        int other_free = global_page_state(NR_FREE_PAGES);
        int other_file = global_page_state(NR_FILE_PAGES) -
@@ -90,7 +90,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
                }
        }
        if (sc->nr_to_scan > 0)
-               lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %d\n",
+               lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n",
                                sc->nr_to_scan, sc->gfp_mask, other_free,
                                other_file, min_score_adj);
        rem = global_page_state(NR_ACTIVE_ANON) +
@@ -107,7 +107,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
        rcu_read_lock();
        for_each_process(tsk) {
                struct task_struct *p;
-               int oom_score_adj;
+               short oom_score_adj;
 
                if (tsk->flags & PF_KTHREAD)
                        continue;
@@ -141,11 +141,11 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
                selected = p;
                selected_tasksize = tasksize;
                selected_oom_score_adj = oom_score_adj;
-               lowmem_print(2, "select %d (%s), adj %d, size %d, to kill\n",
+               lowmem_print(2, "select %d (%s), adj %hd, size %d, to kill\n",
                             p->pid, p->comm, oom_score_adj, tasksize);
        }
        if (selected) {
-               lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d\n",
+               lowmem_print(1, "send sigkill to %d (%s), adj %hd, size %d\n",
                             selected->pid, selected->comm,
                             selected_oom_score_adj, selected_tasksize);
                lowmem_deathpending_timeout = jiffies + HZ;
@@ -176,7 +176,7 @@ static void __exit lowmem_exit(void)
 }
 
 module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR);
-module_param_array_named(adj, lowmem_adj, int, &lowmem_adj_size,
+module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
                         S_IRUGO | S_IWUSR);
 module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size,
                         S_IRUGO | S_IWUSR);
index 0908e60..2a70558 100644 (file)
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/balloon_compaction.h>
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
  * multiple balloon pages.  All memory counters in this driver are in balloon
  * page units.
  */
-#define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
+#define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
+#define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
 
 struct virtio_balloon
 {
@@ -52,15 +54,19 @@ struct virtio_balloon
        /* Number of balloon pages we've told the Host we're not using. */
        unsigned int num_pages;
        /*
-        * The pages we've told the Host we're not using.
+        * The pages we've told the Host we're not using are enqueued
+        * at vb_dev_info->pages list.
         * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
         * to num_pages above.
         */
-       struct list_head pages;
+       struct balloon_dev_info *vb_dev_info;
+
+       /* Synchronize access/update to this struct virtio_balloon elements */
+       struct mutex balloon_lock;
 
        /* The array of pfns we tell the Host about. */
        unsigned int num_pfns;
-       u32 pfns[256];
+       u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
 
        /* Memory statistics */
        int need_stats_update;
@@ -122,18 +128,21 @@ static void set_page_pfns(u32 pfns[], struct page *page)
 
 static void fill_balloon(struct virtio_balloon *vb, size_t num)
 {
+       struct balloon_dev_info *vb_dev_info = vb->vb_dev_info;
+
        /* We can only do one array worth at a time. */
        num = min(num, ARRAY_SIZE(vb->pfns));
 
+       mutex_lock(&vb->balloon_lock);
        for (vb->num_pfns = 0; vb->num_pfns < num;
             vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-               struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY |
-                                       __GFP_NOMEMALLOC | __GFP_NOWARN);
+               struct page *page = balloon_page_enqueue(vb_dev_info);
+
                if (!page) {
                        if (printk_ratelimit())
                                dev_printk(KERN_INFO, &vb->vdev->dev,
-                                          "Out of puff! Can't get %zu pages\n",
-                                          num);
+                                          "Out of puff! Can't get %u pages\n",
+                                          VIRTIO_BALLOON_PAGES_PER_PAGE);
                        /* Sleep for at least 1/5 of a second before retry. */
                        msleep(200);
                        break;
@@ -141,14 +150,12 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
                set_page_pfns(vb->pfns + vb->num_pfns, page);
                vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
                totalram_pages--;
-               list_add(&page->lru, &vb->pages);
        }
 
-       /* Didn't get any?  Oh well. */
-       if (vb->num_pfns == 0)
-               return;
-
-       tell_host(vb, vb->inflate_vq);
+       /* Did we get any? */
+       if (vb->num_pfns != 0)
+               tell_host(vb, vb->inflate_vq);
+       mutex_unlock(&vb->balloon_lock);
 }
 
 static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
@@ -157,7 +164,7 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
 
        /* Find pfns pointing at start of each page, get pages and free them. */
        for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-               __free_page(balloon_pfn_to_page(pfns[i]));
+               balloon_page_free(balloon_pfn_to_page(pfns[i]));
                totalram_pages++;
        }
 }
@@ -165,14 +172,17 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
 static void leak_balloon(struct virtio_balloon *vb, size_t num)
 {
        struct page *page;
+       struct balloon_dev_info *vb_dev_info = vb->vb_dev_info;
 
        /* We can only do one array worth at a time. */
        num = min(num, ARRAY_SIZE(vb->pfns));
 
+       mutex_lock(&vb->balloon_lock);
        for (vb->num_pfns = 0; vb->num_pfns < num;
             vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-               page = list_first_entry(&vb->pages, struct page, lru);
-               list_del(&page->lru);
+               page = balloon_page_dequeue(vb_dev_info);
+               if (!page)
+                       break;
                set_page_pfns(vb->pfns + vb->num_pfns, page);
                vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
        }
@@ -183,6 +193,7 @@ static void leak_balloon(struct virtio_balloon *vb, size_t num)
         * is true, we *have* to do it in this order
         */
        tell_host(vb, vb->deflate_vq);
+       mutex_unlock(&vb->balloon_lock);
        release_pages_by_pfn(vb->pfns, vb->num_pfns);
 }
 
@@ -339,9 +350,84 @@ static int init_vqs(struct virtio_balloon *vb)
        return 0;
 }
 
+static const struct address_space_operations virtio_balloon_aops;
+#ifdef CONFIG_BALLOON_COMPACTION
+/*
+ * virtballoon_migratepage - perform the balloon page migration on behalf of
+ *                          a compaction thread.     (called under page lock)
+ * @mapping: the page->mapping which will be assigned to the new migrated page.
+ * @newpage: page that will replace the isolated page after migration finishes.
+ * @page   : the isolated (old) page that is about to be migrated to newpage.
+ * @mode   : compaction mode -- not used for balloon page migration.
+ *
+ * After a ballooned page gets isolated by compaction procedures, this is the
+ * function that performs the page migration on behalf of a compaction thread
+ * The page migration for virtio balloon is done in a simple swap fashion which
+ * follows these two macro steps:
+ *  1) insert newpage into vb_dev_info->pages list and update the host about it;
+ *  2) update the host about the old page removed from vb_dev_info->pages list;
+ *
+ * This function performs the balloon page migration task.
+ * Called through balloon_mapping->a_ops->migratepage
+ */
+int virtballoon_migratepage(struct address_space *mapping,
+               struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+       struct balloon_dev_info *vb_dev_info = balloon_page_device(page);
+       struct virtio_balloon *vb;
+       unsigned long flags;
+
+       BUG_ON(!vb_dev_info);
+
+       vb = vb_dev_info->balloon_device;
+
+       /*
+        * In order to avoid lock contention while migrating pages concurrently
+        * to leak_balloon() or fill_balloon() we just give up the balloon_lock
+        * this turn, as it is easier to retry the page migration later.
+        * This also prevents fill_balloon() getting stuck into a mutex
+        * recursion in the case it ends up triggering memory compaction
+        * while it is attempting to inflate the balloon.
+        */
+       if (!mutex_trylock(&vb->balloon_lock))
+               return -EAGAIN;
+
+       /* balloon's page migration 1st step  -- inflate "newpage" */
+       spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
+       balloon_page_insert(newpage, mapping, &vb_dev_info->pages);
+       vb_dev_info->isolated_pages--;
+       spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
+       vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
+       set_page_pfns(vb->pfns, newpage);
+       tell_host(vb, vb->inflate_vq);
+
+       /*
+        * balloon's page migration 2nd step -- deflate "page"
+        *
+        * It's safe to delete page->lru here because this page is at
+        * an isolated migration list, and this step is expected to happen here
+        */
+       balloon_page_delete(page);
+       vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
+       set_page_pfns(vb->pfns, page);
+       tell_host(vb, vb->deflate_vq);
+
+       mutex_unlock(&vb->balloon_lock);
+
+       return MIGRATEPAGE_BALLOON_SUCCESS;
+}
+
+/* define the balloon_mapping->a_ops callback to allow balloon page migration */
+static const struct address_space_operations virtio_balloon_aops = {
+                       .migratepage = virtballoon_migratepage,
+};
+#endif /* CONFIG_BALLOON_COMPACTION */
+
 static int virtballoon_probe(struct virtio_device *vdev)
 {
        struct virtio_balloon *vb;
+       struct address_space *vb_mapping;
+       struct balloon_dev_info *vb_devinfo;
        int err;
 
        vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
@@ -350,16 +436,37 @@ static int virtballoon_probe(struct virtio_device *vdev)
                goto out;
        }
 
-       INIT_LIST_HEAD(&vb->pages);
        vb->num_pages = 0;
+       mutex_init(&vb->balloon_lock);
        init_waitqueue_head(&vb->config_change);
        init_waitqueue_head(&vb->acked);
        vb->vdev = vdev;
        vb->need_stats_update = 0;
 
+       vb_devinfo = balloon_devinfo_alloc(vb);
+       if (IS_ERR(vb_devinfo)) {
+               err = PTR_ERR(vb_devinfo);
+               goto out_free_vb;
+       }
+
+       vb_mapping = balloon_mapping_alloc(vb_devinfo,
+                                          (balloon_compaction_check()) ?
+                                          &virtio_balloon_aops : NULL);
+       if (IS_ERR(vb_mapping)) {
+               /*
+                * IS_ERR(vb_mapping) && PTR_ERR(vb_mapping) == -EOPNOTSUPP
+                * This means !CONFIG_BALLOON_COMPACTION, otherwise we get off.
+                */
+               err = PTR_ERR(vb_mapping);
+               if (err != -EOPNOTSUPP)
+                       goto out_free_vb_devinfo;
+       }
+
+       vb->vb_dev_info = vb_devinfo;
+
        err = init_vqs(vb);
        if (err)
-               goto out_free_vb;
+               goto out_free_vb_mapping;
 
        vb->thread = kthread_run(balloon, vb, "vballoon");
        if (IS_ERR(vb->thread)) {
@@ -371,6 +478,10 @@ static int virtballoon_probe(struct virtio_device *vdev)
 
 out_del_vqs:
        vdev->config->del_vqs(vdev);
+out_free_vb_mapping:
+       balloon_mapping_free(vb_mapping);
+out_free_vb_devinfo:
+       balloon_devinfo_free(vb_devinfo);
 out_free_vb:
        kfree(vb);
 out:
@@ -396,6 +507,8 @@ static void __devexit virtballoon_remove(struct virtio_device *vdev)
 
        kthread_stop(vb->thread);
        remove_common(vb);
+       balloon_mapping_free(vb->vb_dev_info->mapping);
+       balloon_devinfo_free(vb->vb_dev_info);
        kfree(vb);
 }
 
index 7cda519..22a0439 100644 (file)
@@ -3416,8 +3416,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
        num_dirty = root->fs_info->dirty_metadata_bytes;
 
        if (num_dirty > thresh) {
-               balance_dirty_pages_ratelimited_nr(
-                                  root->fs_info->btree_inode->i_mapping, 1);
+               balance_dirty_pages_ratelimited(
+                                  root->fs_info->btree_inode->i_mapping);
        }
        return;
 }
@@ -3437,8 +3437,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
        num_dirty = root->fs_info->dirty_metadata_bytes;
 
        if (num_dirty > thresh) {
-               balance_dirty_pages_ratelimited_nr(
-                                  root->fs_info->btree_inode->i_mapping, 1);
+               balance_dirty_pages_ratelimited(
+                                  root->fs_info->btree_inode->i_mapping);
        }
        return;
 }
index 9ab1bed..a8ee75c 100644 (file)
@@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
                cond_resched();
 
-               balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-                                                  dirty_pages);
+               balance_dirty_pages_ratelimited(inode->i_mapping);
                if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
                        btrfs_btree_balance_dirty(root, 1);
 
index 8fcf9a5..5b3429a 100644 (file)
@@ -1225,7 +1225,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
                }
 
                defrag_count += ret;
-               balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+               balance_dirty_pages_ratelimited(inode->i_mapping);
                mutex_unlock(&inode->i_mutex);
 
                if (newer_than) {
index ec0aca8..6e9ed48 100644 (file)
@@ -555,7 +555,7 @@ void emergency_thaw_all(void)
  */
 int sync_mapping_buffers(struct address_space *mapping)
 {
-       struct address_space *buffer_mapping = mapping->assoc_mapping;
+       struct address_space *buffer_mapping = mapping->private_data;
 
        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
                return 0;
@@ -588,10 +588,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
        struct address_space *buffer_mapping = bh->b_page->mapping;
 
        mark_buffer_dirty(bh);
-       if (!mapping->assoc_mapping) {
-               mapping->assoc_mapping = buffer_mapping;
+       if (!mapping->private_data) {
+               mapping->private_data = buffer_mapping;
        } else {
-               BUG_ON(mapping->assoc_mapping != buffer_mapping);
+               BUG_ON(mapping->private_data != buffer_mapping);
        }
        if (!bh->b_assoc_map) {
                spin_lock(&buffer_mapping->private_lock);
@@ -788,7 +788,7 @@ void invalidate_inode_buffers(struct inode *inode)
        if (inode_has_buffers(inode)) {
                struct address_space *mapping = &inode->i_data;
                struct list_head *list = &mapping->private_list;
-               struct address_space *buffer_mapping = mapping->assoc_mapping;
+               struct address_space *buffer_mapping = mapping->private_data;
 
                spin_lock(&buffer_mapping->private_lock);
                while (!list_empty(list))
@@ -811,7 +811,7 @@ int remove_inode_buffers(struct inode *inode)
        if (inode_has_buffers(inode)) {
                struct address_space *mapping = &inode->i_data;
                struct list_head *list = &mapping->private_list;
-               struct address_space *buffer_mapping = mapping->assoc_mapping;
+               struct address_space *buffer_mapping = mapping->private_data;
 
                spin_lock(&buffer_mapping->private_lock);
                while (!list_empty(list)) {
index e6c2fd5..0f22d09 100644 (file)
@@ -768,7 +768,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
                mapping->host = s->s_bdev->bd_inode;
                mapping->flags = 0;
                mapping_set_gfp_mask(mapping, GFP_NOFS);
-               mapping->assoc_mapping = NULL;
+               mapping->private_data = NULL;
                mapping->backing_dev_info = s->s_bdi;
                mapping->writeback_index = 0;
        }
index c5bc355..4a55f35 100644 (file)
@@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
-       unsigned long start_addr;
        struct hstate *h = hstate_file(file);
+       struct vm_unmapped_area_info info;
 
        if (len & ~huge_page_mask(h))
                return -EINVAL;
@@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                        return addr;
        }
 
-       if (len > mm->cached_hole_size)
-               start_addr = mm->free_area_cache;
-       else {
-               start_addr = TASK_UNMAPPED_BASE;
-               mm->cached_hole_size = 0;
-       }
-
-full_search:
-       addr = ALIGN(start_addr, huge_page_size(h));
-
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (TASK_SIZE - len < addr) {
-                       /*
-                        * Start a new search - just in case we missed
-                        * some holes.
-                        */
-                       if (start_addr != TASK_UNMAPPED_BASE) {
-                               start_addr = TASK_UNMAPPED_BASE;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-
-               if (!vma || addr + len <= vma->vm_start) {
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-               addr = ALIGN(vma->vm_end, huge_page_size(h));
-       }
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = TASK_UNMAPPED_BASE;
+       info.high_limit = TASK_SIZE;
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+       info.align_offset = 0;
+       return vm_unmapped_area(&info);
 }
 #endif
 
@@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
        int rc;
 
        rc = migrate_huge_page_move_mapping(mapping, newpage, page);
-       if (rc)
+       if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
        migrate_page_copy(newpage, page);
 
-       return 0;
+       return MIGRATEPAGE_SUCCESS;
 }
 
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = {
        .kill_sb        = kill_litter_super,
 };
 
-static struct vfsmount *hugetlbfs_vfsmount;
+static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
 
 static int can_do_hugetlb_shm(void)
 {
@@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void)
        return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
 }
 
+/* Map log2(page size) to an hstates[] index; 0 selects the default hstate. */
+static int get_hstate_idx(int page_size_log)
+{
+       struct hstate *h;
+
+       if (!page_size_log)
+               return default_hstate_idx;
+       /* Shift as unsigned long: "1 << n" on a plain int is undefined for n >= 31. */
+       h = size_to_hstate(1UL << page_size_log);
+       return h ? h - hstates : -1;    /* -1: no hstate of that size */
+}
+
 struct file *hugetlb_file_setup(const char *name, unsigned long addr,
                                size_t size, vm_flags_t acctflag,
-                               struct user_struct **user, int creat_flags)
+                               struct user_struct **user,
+                               int creat_flags, int page_size_log)
 {
        int error = -ENOMEM;
        struct file *file;
@@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
        struct qstr quick_string;
        struct hstate *hstate;
        unsigned long num_pages;
+       int hstate_idx;
+
+       hstate_idx = get_hstate_idx(page_size_log);
+       if (hstate_idx < 0)
+               return ERR_PTR(-ENODEV);
 
        *user = NULL;
-       if (!hugetlbfs_vfsmount)
+       if (!hugetlbfs_vfsmount[hstate_idx])
                return ERR_PTR(-ENOENT);
 
        if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
@@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
                }
        }
 
-       root = hugetlbfs_vfsmount->mnt_root;
+       root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
        quick_string.name = name;
        quick_string.len = strlen(quick_string.name);
        quick_string.hash = 0;
@@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
        if (!path.dentry)
                goto out_shm_unlock;
 
-       path.mnt = mntget(hugetlbfs_vfsmount);
+       path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
        error = -ENOSPC;
        inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
        if (!inode)
@@ -1011,8 +1003,9 @@ out_shm_unlock:
 
 static int __init init_hugetlbfs_fs(void)
 {
+       struct hstate *h;
        int error;
-       struct vfsmount *vfsmount;
+       int i;
 
        error = bdi_init(&hugetlbfs_backing_dev_info);
        if (error)
@@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void)
        if (error)
                goto out;
 
-       vfsmount = kern_mount(&hugetlbfs_fs_type);
+       i = 0;
+       for_each_hstate(h) {
+               char buf[50];
+               unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
 
-       if (!IS_ERR(vfsmount)) {
-               hugetlbfs_vfsmount = vfsmount;
-               return 0;
-       }
+               snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
+               hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
+                                                       buf);
 
-       error = PTR_ERR(vfsmount);
+               if (IS_ERR(hugetlbfs_vfsmount[i])) {
+                       pr_err("hugetlb: Cannot mount internal hugetlbfs for "
+                               "page size %uK", ps_kb);
+                       error = PTR_ERR(hugetlbfs_vfsmount[i]);
+                       hugetlbfs_vfsmount[i] = NULL;
+               }
+               i++;
+       }
+       /* Non default hstates are optional */
+       if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
+               return 0;
 
  out:
        kmem_cache_destroy(hugetlbfs_inode_cachep);
@@ -1047,13 +1052,19 @@ static int __init init_hugetlbfs_fs(void)
 
 static void __exit exit_hugetlbfs_fs(void)
 {
+       struct hstate *h;
+       int i;
+
+
        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();
        kmem_cache_destroy(hugetlbfs_inode_cachep);
-       kern_unmount(hugetlbfs_vfsmount);
+       i = 0;
+       for_each_hstate(h)
+               kern_unmount(hugetlbfs_vfsmount[i++]);
        unregister_filesystem(&hugetlbfs_fs_type);
        bdi_destroy(&hugetlbfs_backing_dev_info);
 }
index 64999f1..14084b7 100644 (file)
@@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        mapping->host = inode;
        mapping->flags = 0;
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
-       mapping->assoc_mapping = NULL;
+       mapping->private_data = NULL;
        mapping->backing_dev_info = &default_backing_dev_info;
        mapping->writeback_index = 0;
 
index 3e7b2a0..07f76db 100644 (file)
@@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
        mapping->host = inode;
        mapping->flags = 0;
        mapping_set_gfp_mask(mapping, GFP_NOFS);
-       mapping->assoc_mapping = NULL;
+       mapping->private_data = NULL;
        mapping->backing_dev_info = bdi;
        mapping->a_ops = &empty_aops;
 }
index 5a4ee77..dda0898 100644 (file)
@@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
                ret = sd.num_spliced;
 
        if (ret > 0) {
-               unsigned long nr_pages;
                int err;
 
-               nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
                err = generic_write_sync(out, *ppos, ret);
                if (err)
                        ret = err;
                else
                        *ppos += ret;
 
-               balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+               balance_dirty_pages_ratelimited(mapping);
        }
 
        return ret;
index 9e28356..aa63d25 100644 (file)
@@ -985,7 +985,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 {
        struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
        char buffer[PROC_NUMBUF];
-       int oom_score_adj = OOM_SCORE_ADJ_MIN;
+       short oom_score_adj = OOM_SCORE_ADJ_MIN;
        unsigned long flags;
        size_t len;
 
@@ -996,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
                unlock_task_sighand(task, &flags);
        }
        put_task_struct(task);
-       len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+       len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
 
@@ -1043,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                goto err_task_lock;
        }
 
-       if (oom_score_adj < task->signal->oom_score_adj_min &&
+       if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
                        !capable(CAP_SYS_RESOURCE)) {
                err = -EACCES;
                goto err_sighand;
        }
 
-       task->signal->oom_score_adj = oom_score_adj;
+       task->signal->oom_score_adj = (short)oom_score_adj;
        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
-               task->signal->oom_score_adj_min = oom_score_adj;
+               task->signal->oom_score_adj_min = (short)oom_score_adj;
        trace_oom_score_adj_update(task);
 
 err_sighand:
index 13e5b47..8890604 100644 (file)
@@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                ret = sd.num_spliced;
 
        if (ret > 0) {
-               unsigned long nr_pages;
                int err;
 
-               nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
                err = generic_write_sync(out, *ppos, ret);
                if (err)
                        ret = err;
                else
                        *ppos += ret;
-               balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+               balance_dirty_pages_ratelimited(mapping);
        }
        sb_end_write(inode->i_sb);
 
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
new file mode 100644 (file)
index 0000000..f7f1d71
--- /dev/null
@@ -0,0 +1,272 @@
+/*
+ * include/linux/balloon_compaction.h
+ *
+ * Common interface definitions for making balloon pages movable by compaction.
+ *
+ * Despite being perfectly possible to perform ballooned pages migration, they
+ * make a special corner case to compaction scans because balloon pages are not
+ * enlisted at any LRU list like the other pages we do compact / migrate.
+ *
+ * As the page isolation scanning step a compaction thread does is a lockless
+ * procedure (from a page standpoint), it might bring some racy situations while
+ * performing balloon page compaction. In order to sort out these racy scenarios
+ * and safely perform balloon's page compaction and migration we must, always,
+ * ensure following these three simple rules:
+ *
+ *   i. when updating a balloon's page ->mapping element, strictly do it under
+ *      the following lock order, independently of the far superior
+ *      locking scheme (lru_lock, balloon_lock):
+ *         +-page_lock(page);
+ *           +--spin_lock_irq(&b_dev_info->pages_lock);
+ *                 ... page->mapping updates here ...
+ *
+ *  ii. before isolating or dequeueing a balloon page from the balloon device
+ *      pages list, the page reference counter must be raised by one and the
+ *      extra refcount must be dropped when the page is enqueued back into
+ *      the balloon device page list, thus a balloon page keeps its reference
+ *      counter raised only while it is under our special handling;
+ *
+ * iii. after the lockless scan step has selected a potential balloon page for
+ *      isolation, re-test the page->mapping flags and the page ref counter
+ *      under the proper page lock, to ensure isolating a valid balloon page
+ *      (not yet isolated, nor under release procedure)
+ *
+ * The functions provided by this interface are placed to help on coping with
+ * the aforementioned balloon page corner case, as well as to ensure the simple
+ * set of exposed rules are satisfied while we are dealing with balloon pages
+ * compaction / migration.
+ *
+ * Copyright (C) 2012, Red Hat, Inc.  Rafael Aquini <aquini@redhat.com>
+ */
+#ifndef _LINUX_BALLOON_COMPACTION_H
+#define _LINUX_BALLOON_COMPACTION_H
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/migrate.h>
+#include <linux/gfp.h>
+#include <linux/err.h>
+
+/*
+ * Balloon device information descriptor.
+ * This struct is used to allow the common balloon compaction interface
+ * procedures to find the proper balloon device holding the memory pages they
+ * have to cope with for page compaction / migration; it also serves the
+ * balloon driver as a page book-keeper for its registered balloon devices.
+ */
+struct balloon_dev_info {
+       void *balloon_device;           /* balloon device descriptor */
+       struct address_space *mapping;  /* balloon special page->mapping */
+       unsigned long isolated_pages;   /* # of isolated pages for migration */
+       spinlock_t pages_lock;          /* Protection to pages list */
+       struct list_head pages;         /* Pages enqueued & handled to Host */
+};
+
+extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info);
+extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
+extern struct balloon_dev_info *balloon_devinfo_alloc(
+                                               void *balloon_dev_descriptor);
+
+static inline void balloon_devinfo_free(struct balloon_dev_info *b_dev_info)
+{
+       kfree(b_dev_info);
+}
+
+/*
+ * balloon_page_free - release a balloon page back to the page free lists
+ * @page: ballooned page to be set free
+ *
+ * This function must be used to properly set free an isolated/dequeued balloon
+ * page at the end of a successful page migration, or at the balloon driver's
+ * page release procedure.
+ */
+static inline void balloon_page_free(struct page *page)
+{
+       /*
+        * Balloon pages always get an extra refcount before being isolated
+        * and before being dequeued, to help sort out fortuitous collisions
+        * between a thread attempting to isolate and another thread attempting
+        * to release the very same balloon page.
+        *
+        * Before we hand the page back to Buddy, let's drop its extra refcnt.
+        */
+       put_page(page);
+       __free_page(page);
+}
+
+#ifdef CONFIG_BALLOON_COMPACTION
+extern bool balloon_page_isolate(struct page *page);
+extern void balloon_page_putback(struct page *page);
+extern int balloon_page_migrate(struct page *newpage,
+                               struct page *page, enum migrate_mode mode);
+extern struct address_space
+*balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
+                       const struct address_space_operations *a_ops);
+
+static inline void balloon_mapping_free(struct address_space *balloon_mapping)
+{
+       kfree(balloon_mapping);
+}
+
+/*
+ * page_flags_cleared - helper to perform balloon @page ->flags tests.
+ *
+ * As balloon pages are obtained from buddy and we do not play with page->flags
+ * at driver level (exception made when we get the page lock for compaction),
+ * we can safely identify a ballooned page by checking if the
+ * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared.  This approach also
+ * helps us skip ballooned pages that are locked for compaction or release, thus
+ * mitigating their racy check at balloon_page_movable()
+ */
+static inline bool page_flags_cleared(struct page *page)
+{
+       return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP);
+}
+
+/*
+ * __is_movable_balloon_page - helper to perform @page mapping->flags tests
+ */
+static inline bool __is_movable_balloon_page(struct page *page)
+{
+       struct address_space *mapping = page->mapping;
+       return mapping_balloon(mapping);
+}
+
+/*
+ * balloon_page_movable - test page->mapping->flags to identify balloon pages
+ *                       that can be moved by compaction/migration.
+ *
+ * This function is used at core compaction's page isolation scheme, therefore
+ * most pages exposed to it are not enlisted as balloon pages and so, to avoid
+ * undesired side effects like racing against __free_pages(), we cannot afford
+ * holding the page locked while testing page->mapping->flags here.
+ *
+ * As we might return false positives in the case of a balloon page being just
+ * released under us, the page->mapping->flags need to be re-tested later,
+ * under the proper page lock, at the functions that will be coping with the
+ * balloon page case.
+ */
+static inline bool balloon_page_movable(struct page *page)
+{
+       /*
+        * Before dereferencing and testing mapping->flags, let's make sure
+        * this is not a page that uses ->mapping in a different way
+        */
+       if (page_flags_cleared(page) && !page_mapped(page) &&
+           page_count(page) == 1)
+               return __is_movable_balloon_page(page);
+
+       return false;
+}
+
+/*
+ * balloon_page_insert - insert a page into the balloon's page list and make
+ *                      the page->mapping assignment accordingly.
+ * @page    : page to be assigned as a 'balloon page'
+ * @mapping : allocated special 'balloon_mapping'
+ * @head    : balloon's device page list head
+ *
+ * Caller must ensure the page is locked and the spin_lock protecting balloon
+ * pages list is held before inserting a page into the balloon device.
+ */
+static inline void balloon_page_insert(struct page *page,
+                                      struct address_space *mapping,
+                                      struct list_head *head)
+{
+       page->mapping = mapping;
+       list_add(&page->lru, head);
+}
+
+/*
+ * balloon_page_delete - delete a page from balloon's page list and clear
+ *                      the page->mapping assignment accordingly.
+ * @page    : page to be released from balloon's page list
+ *
+ * Caller must ensure the page is locked and the spin_lock protecting balloon
+ * pages list is held before deleting a page from the balloon device.
+ */
+static inline void balloon_page_delete(struct page *page)
+{
+       page->mapping = NULL;
+       list_del(&page->lru);
+}
+
+/*
+ * balloon_page_device - get the b_dev_info descriptor for the balloon device
+ *                      that enqueues the given page.
+ */
+static inline struct balloon_dev_info *balloon_page_device(struct page *page)
+{
+       struct address_space *mapping = page->mapping;
+       if (likely(mapping))
+               return mapping->private_data;
+
+       return NULL;
+}
+
+static inline gfp_t balloon_mapping_gfp_mask(void)
+{
+       return GFP_HIGHUSER_MOVABLE;
+}
+
+static inline bool balloon_compaction_check(void)
+{
+       return true;
+}
+
+#else /* !CONFIG_BALLOON_COMPACTION */
+
+static inline void *balloon_mapping_alloc(void *balloon_device,
+                               const struct address_space_operations *a_ops)
+{
+       return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void balloon_mapping_free(struct address_space *balloon_mapping)
+{
+       return;
+}
+
+static inline void balloon_page_insert(struct page *page,
+                                      struct address_space *mapping,
+                                      struct list_head *head)
+{
+       list_add(&page->lru, head);
+}
+
+static inline void balloon_page_delete(struct page *page)
+{
+       list_del(&page->lru);
+}
+
+static inline bool balloon_page_movable(struct page *page)
+{
+       return false;
+}
+
+static inline bool balloon_page_isolate(struct page *page)
+{
+       return false;
+}
+
+static inline void balloon_page_putback(struct page *page)
+{
+       return;
+}
+
+static inline int balloon_page_migrate(struct page *newpage,
+                               struct page *page, enum migrate_mode mode)
+{
+       return 0;
+}
+
+static inline gfp_t balloon_mapping_gfp_mask(void)
+{
+       return GFP_HIGHUSER;
+}
+
+static inline bool balloon_compaction_check(void)
+{
+       return false;
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
+#endif /* _LINUX_BALLOON_COMPACTION_H */
index 6d6795d..7b74452 100644 (file)
@@ -51,8 +51,8 @@ extern unsigned long free_all_bootmem(void);
 extern void free_bootmem_node(pg_data_t *pgdat,
                              unsigned long addr,
                              unsigned long size);
-extern void free_bootmem(unsigned long addr, unsigned long size);
-extern void free_bootmem_late(unsigned long addr, unsigned long size);
+extern void free_bootmem(unsigned long physaddr, unsigned long size);
+extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
 
 /*
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
index 75fe9a1..408fb1e 100644 (file)
@@ -418,7 +418,7 @@ struct address_space {
        struct backing_dev_info *backing_dev_info; /* device readahead, etc */
        spinlock_t              private_lock;   /* for use by the address_space */
        struct list_head        private_list;   /* ditto */
-       struct address_space    *assoc_mapping; /* ditto */
+       void                    *private_data;  /* ditto */
 } __attribute__((aligned(sizeof(long))));
        /*
         * On most architectures that alignment is already the case; but
index d0a7967..31e8041 100644 (file)
@@ -266,7 +266,7 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 
 static inline int gfp_zonelist(gfp_t flags)
 {
-       if (NUMA_BUILD && unlikely(flags & __GFP_THISNODE))
+       if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
                return 1;
 
        return 0;
index b31cb7d..1af4775 100644 (file)
@@ -8,6 +8,10 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
 extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                         pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                         struct vm_area_struct *vma);
+extern void huge_pmd_set_accessed(struct mm_struct *mm,
+                                 struct vm_area_struct *vma,
+                                 unsigned long address, pmd_t *pmd,
+                                 pmd_t orig_pmd, int dirty);
 extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                               unsigned long address, pmd_t *pmd,
                               pmd_t orig_pmd);
index 2251648..3e7fa1a 100644 (file)
@@ -183,7 +183,8 @@ extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, unsigned long addr,
                                size_t size, vm_flags_t acct,
-                               struct user_struct **user, int creat_flags);
+                               struct user_struct **user, int creat_flags,
+                               int page_size_log);
 
 static inline int is_file_hugepages(struct file *file)
 {
@@ -195,12 +196,14 @@ static inline int is_file_hugepages(struct file *file)
        return 0;
 }
 
+
 #else /* !CONFIG_HUGETLBFS */
 
 #define is_file_hugepages(file)                        0
 static inline struct file *
 hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
-               vm_flags_t acctflag, struct user_struct **user, int creat_flags)
+               vm_flags_t acctflag, struct user_struct **user, int creat_flags,
+               int page_size_log)
 {
        return ERR_PTR(-ENOSYS);
 }
index 7d8dfc7..dd9900c 100644 (file)
@@ -687,20 +687,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 /* Trap pasters of __FUNCTION__ at compile-time */
 #define __FUNCTION__ (__func__)
 
-/* This helps us to avoid #ifdef CONFIG_NUMA */
-#ifdef CONFIG_NUMA
-#define NUMA_BUILD 1
-#else
-#define NUMA_BUILD 0
-#endif
-
-/* This helps us avoid #ifdef CONFIG_COMPACTION */
-#ifdef CONFIG_COMPACTION
-#define COMPACTION_BUILD 1
-#else
-#define COMPACTION_BUILD 0
-#endif
-
 /* This helps us to avoid #ifdef CONFIG_SYMBOL_PREFIX */
 #ifdef CONFIG_SYMBOL_PREFIX
 #define SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX
index ff9a9f8..a09216d 100644 (file)
@@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
 struct memory_notify {
        unsigned long start_pfn;
        unsigned long nr_pages;
+       int status_change_nid_normal;
        int status_change_nid;
 };
 
index 95573ec..4a45c4e 100644 (file)
@@ -26,6 +26,13 @@ enum {
        MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
 };
 
+/* Types for controlling the zone type of onlined memory */
+enum {
+       ONLINE_KEEP,
+       ONLINE_KERNEL,
+       ONLINE_MOVABLE,
+};
+
 /*
  * pgdat resizing functions
  */
@@ -46,6 +53,10 @@ void pgdat_resize_init(struct pglist_data *pgdat)
 }
 /*
  * Zone resizing functions
+ *
+ * Note: any attempt to resize a zone should have pgdat_resize_lock() and
+ * zone_span_writelock() both held. This ensures the size of a zone
+ * can't be changed while pgdat_resize_lock() is held.
  */
 static inline unsigned zone_span_seqbegin(struct zone *zone)
 {
@@ -71,7 +82,7 @@ extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
 extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
 extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
 /* VM interface that may be used by firmware interface */
-extern int online_pages(unsigned long, unsigned long);
+extern int online_pages(unsigned long, unsigned long, int);
 extern void __offline_isolated_pages(unsigned long, unsigned long);
 
 typedef void (*online_page_callback_t)(struct page *page);
index ce7e667..0b5865c 100644 (file)
@@ -7,9 +7,27 @@
 
 typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
+/*
+ * Return values from address_space_operations.migratepage():
+ * - negative errno on page migration failure;
+ * - zero on page migration success;
+ *
+ * The balloon page migration introduces this special case where a 'distinct'
+ * return code is used to flag a successful page migration to unmap_and_move().
+ * This approach is necessary because page migration can race against balloon
+ * deflation procedure, and for such case we could introduce a nasty page leak
+ * if a successfully migrated balloon page gets released concurrently with
+ * migration's unmap_and_move() wrap-up steps.
+ */
+#define MIGRATEPAGE_SUCCESS            0
+#define MIGRATEPAGE_BALLOON_SUCCESS    1 /* special ret code for balloon page
+                                          * successful migration case.
+                                          */
+
 #ifdef CONFIG_MIGRATION
 
 extern void putback_lru_pages(struct list_head *l);
+extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
                        struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t x,
@@ -33,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_lru_pages(struct list_head *l) {}
+static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
                unsigned long private, bool offlining,
                enum migrate_mode mode) { return -ENOSYS; }
index bcaab4e..4af4f0b 100644 (file)
@@ -1456,6 +1456,37 @@ extern unsigned long vm_mmap(struct file *, unsigned long,
         unsigned long, unsigned long,
         unsigned long, unsigned long);
 
+struct vm_unmapped_area_info {
+#define VM_UNMAPPED_AREA_TOPDOWN 1
+       unsigned long flags;
+       unsigned long length;
+       unsigned long low_limit;
+       unsigned long high_limit;
+       unsigned long align_mask;
+       unsigned long align_offset;
+};
+
+extern unsigned long unmapped_area(struct vm_unmapped_area_info *info);
+extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
+
+/*
+ * Search for an unmapped address range.
+ *
+ * We are looking for a range that:
+ * - does not intersect with any VMA;
+ * - is contained within the [low_limit, high_limit) interval;
+ * - is at least the desired size.
+ * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
+ */
+static inline unsigned long
+vm_unmapped_area(struct vm_unmapped_area_info *info)
+{
+       if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN))
+               return unmapped_area(info);
+       else
+               return unmapped_area_topdown(info);
+}
+
 /* truncate.c */
 extern void truncate_inode_pages(struct address_space *, loff_t);
 extern void truncate_inode_pages_range(struct address_space *,
index 31f8a3a..7ade273 100644 (file)
@@ -224,7 +224,8 @@ struct vm_region {
  * library, the executable area etc).
  */
 struct vm_area_struct {
-       struct mm_struct * vm_mm;       /* The address space we belong to. */
+       /* The first cache line has the info for VMA tree walking. */
+
        unsigned long vm_start;         /* Our start address within vm_mm. */
        unsigned long vm_end;           /* The first byte after our end address
                                           within vm_mm. */
@@ -232,11 +233,22 @@ struct vm_area_struct {
        /* linked list of VM areas per task, sorted by address */
        struct vm_area_struct *vm_next, *vm_prev;
 
+       struct rb_node vm_rb;
+
+       /*
+        * Largest free memory gap in bytes to the left of this VMA.
+        * Either between this VMA and vma->vm_prev, or between one of the
+        * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
+        * get_unmapped_area find a free area of the right size.
+        */
+       unsigned long rb_subtree_gap;
+
+       /* Second cache line starts here. */
+
+       struct mm_struct *vm_mm;        /* The address space we belong to. */
        pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
        unsigned long vm_flags;         /* Flags, see mm.h. */
 
-       struct rb_node vm_rb;
-
        /*
         * For areas with an address space and backing store,
         * linkage into the address_space->i_mmap interval tree, or
@@ -322,6 +334,7 @@ struct mm_struct {
        unsigned long task_size;                /* size of task vm space */
        unsigned long cached_hole_size;         /* if non-zero, the largest hole below free_area_cache */
        unsigned long free_area_cache;          /* first hole of size cached_hole_size or larger */
+       unsigned long highest_vm_end;           /* highest vma end address */
        pgd_t * pgd;
        atomic_t mm_users;                      /* How many users with user space? */
        atomic_t mm_count;                      /* How many references to "struct mm_struct" (users count as 1) */
index a23923b..0c0b1d6 100644 (file)
@@ -63,10 +63,8 @@ enum {
 
 #ifdef CONFIG_CMA
 #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
-#  define cma_wmark_pages(zone)        zone->min_cma_pages
 #else
 #  define is_migrate_cma(migratetype) false
-#  define cma_wmark_pages(zone) 0
 #endif
 
 #define for_each_migratetype_order(order, type) \
@@ -383,13 +381,6 @@ struct zone {
        /* see spanned/present_pages for more description */
        seqlock_t               span_seqlock;
 #endif
-#ifdef CONFIG_CMA
-       /*
-        * CMA needs to increase watermark levels during the allocation
-        * process to make sure that the system is not starved.
-        */
-       unsigned long           min_cma_pages;
-#endif
        struct free_area        free_area[MAX_ORDER];
 
 #ifndef CONFIG_SPARSEMEM
index 624e53c..2115ad5 100644 (file)
@@ -27,10 +27,9 @@ struct node {
 };
 
 struct memory_block;
-extern struct node node_devices[];
+extern struct node *node_devices[];
 typedef  void (*node_registration_func_t)(struct node *);
 
-extern int register_node(struct node *, int, struct node *);
 extern void unregister_node(struct node *node);
 #ifdef CONFIG_NUMA
 extern int register_one_node(int nid);
index fb98268..da60007 100644 (file)
@@ -29,8 +29,23 @@ enum oom_scan_t {
        OOM_SCAN_SELECT,        /* always select this thread first */
 };
 
-extern void compare_swap_oom_score_adj(int old_val, int new_val);
-extern int test_set_oom_score_adj(int new_val);
+/* Thread is the potential origin of an oom condition; kill first on oom */
+#define OOM_FLAG_ORIGIN                ((__force oom_flags_t)0x1)
+
+/* Set OOM_FLAG_ORIGIN in the oom_flags of current's signal_struct. */
+static inline void set_current_oom_origin(void)
+{
+       struct signal_struct *sig = current->signal;
+
+       sig->oom_flags |= OOM_FLAG_ORIGIN;
+}
+
+/* Clear OOM_FLAG_ORIGIN in the oom_flags of current's signal_struct. */
+static inline void clear_current_oom_origin(void)
+{
+       struct signal_struct *sig = current->signal;
+
+       sig->oom_flags &= ~OOM_FLAG_ORIGIN;
+}
+
+/* Return true when OOM_FLAG_ORIGIN is set in @p->signal->oom_flags. */
+static inline bool oom_task_origin(const struct task_struct *p)
+{
+       const oom_flags_t flags = p->signal->oom_flags;
+
+       return (flags & OOM_FLAG_ORIGIN) != 0;
+}
 
 extern unsigned long oom_badness(struct task_struct *p,
                struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -49,8 +64,6 @@ extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
                unsigned long totalpages, const nodemask_t *nodemask,
                bool force_kill);
-extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
-                                    int order);
 
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                int order, nodemask_t *mask, bool force_kill);
index 76a9539..a92061e 100644 (file)
@@ -2,7 +2,8 @@
 #define __LINUX_PAGEISOLATION_H
 
 
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count);
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+                        bool skip_hwpoisoned_pages);
 void set_pageblock_migratetype(struct page *page, int migratetype);
 int move_freepages_block(struct zone *zone, struct page *page,
                                int migratetype);
@@ -21,7 +22,7 @@ int move_freepages(struct zone *zone,
  */
 int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-                        unsigned migratetype);
+                        unsigned migratetype, bool skip_hwpoisoned_pages);
 
 /*
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
@@ -34,12 +35,13 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 /*
  * Test all pages in [start_pfn, end_pfn) are isolated or not.
  */
-int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn);
+int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
+                       bool skip_hwpoisoned_pages);
 
 /*
  * Internal functions. Changes pageblock's migrate type.
  */
-int set_migratetype_isolate(struct page *page);
+int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
 void unset_migratetype_isolate(struct page *page, unsigned migratetype);
 struct page *alloc_migrate_target(struct page *page, unsigned long private,
                                int **resultp);
index e42c762..6da609d 100644 (file)
@@ -24,6 +24,7 @@ enum mapping_flags {
        AS_ENOSPC       = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */
        AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */
        AS_UNEVICTABLE  = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */
+       AS_BALLOON_MAP  = __GFP_BITS_SHIFT + 4, /* balloon page special map */
 };
 
 static inline void mapping_set_error(struct address_space *mapping, int error)
@@ -53,6 +54,21 @@ static inline int mapping_unevictable(struct address_space *mapping)
        return !!mapping;
 }
 
+/* Set the AS_BALLOON_MAP bit in @mapping->flags. */
+static inline void mapping_set_balloon(struct address_space *mapping)
+{
+       set_bit(AS_BALLOON_MAP, &mapping->flags);
+}
+
+/* Clear the AS_BALLOON_MAP bit in @mapping->flags. */
+static inline void mapping_clear_balloon(struct address_space *mapping)
+{
+       clear_bit(AS_BALLOON_MAP, &mapping->flags);
+}
+
+/*
+ * Return 1 when @mapping is non-NULL and has the AS_BALLOON_MAP bit set in
+ * its flags, 0 otherwise.
+ */
+static inline int mapping_balloon(struct address_space *mapping)
+{
+       if (mapping == NULL)
+               return 0;
+
+       return test_bit(AS_BALLOON_MAP, &mapping->flags) != 0;
+}
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
        return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
index 0dd42a0..3e387df 100644 (file)
@@ -631,9 +631,10 @@ struct signal_struct {
        struct rw_semaphore group_rwsem;
 #endif
 
-       int oom_score_adj;      /* OOM kill score adjustment */
-       int oom_score_adj_min;  /* OOM kill score adjustment minimum value.
-                                * Only settable by CAP_SYS_RESOURCE. */
+       oom_flags_t oom_flags;
+       short oom_score_adj;            /* OOM kill score adjustment */
+       short oom_score_adj_min;        /* OOM kill score adjustment min value.
+                                        * Only settable by CAP_SYS_RESOURCE. */
 
        struct mutex cred_guard_mutex;  /* guard against foreign influences on
                                         * credential calculations
index bcf8a6a..429c199 100644 (file)
@@ -29,6 +29,21 @@ struct shmid_kernel /* private to the kernel */
 #define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
 #define SHM_NORESERVE   010000  /* don't check for reservations */
 
+/* Bits [26:31] are reserved */
+
+/*
+ * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all powers of two.
+ * When 0 use the default page size.
+ */
+#define SHM_HUGE_SHIFT  26
+#define SHM_HUGE_MASK   0x3f
+#define SHM_HUGE_2MB    (21 << SHM_HUGE_SHIFT)
+#define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
+
 #ifdef CONFIG_SYSVIPC
 long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
              unsigned long shmlba);
index 1cc0e4b..4d118ba 100644 (file)
@@ -156,6 +156,7 @@ typedef u32 dma_addr_t;
 #endif
 typedef unsigned __bitwise__ gfp_t;
 typedef unsigned __bitwise__ fmode_t;
+typedef unsigned __bitwise__ oom_flags_t;
 
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
 typedef u64 phys_addr_t;
index 50c3e8f..b82a83a 100644 (file)
@@ -161,14 +161,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
                            unsigned long start_time);
 
 void page_writeback_init(void);
-void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
-                                       unsigned long nr_pages_dirtied);
-
-static inline void
-balance_dirty_pages_ratelimited(struct address_space *mapping)
-{
-       balance_dirty_pages_ratelimited_nr(mapping, 1);
-}
+void balance_dirty_pages_ratelimited(struct address_space *mapping);
 
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                void *data);
index dd4ba3b..1e97498 100644 (file)
@@ -14,7 +14,7 @@ TRACE_EVENT(oom_score_adj_update,
        TP_STRUCT__entry(
                __field(        pid_t,  pid)
                __array(        char,   comm,   TASK_COMM_LEN )
-               __field(         int,   oom_score_adj)
+               __field(        short,  oom_score_adj)
        ),
 
        TP_fast_assign(
@@ -23,7 +23,7 @@ TRACE_EVENT(oom_score_adj_update,
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),
 
-       TP_printk("pid=%d comm=%s oom_score_adj=%d",
+       TP_printk("pid=%d comm=%s oom_score_adj=%hd",
                __entry->pid, __entry->comm, __entry->oom_score_adj)
 );
 
index b53add0..102a646 100644 (file)
@@ -15,7 +15,7 @@ TRACE_EVENT(task_newtask,
                __field(        pid_t,  pid)
                __array(        char,   comm, TASK_COMM_LEN)
                __field( unsigned long, clone_flags)
-               __field(        int,    oom_score_adj)
+               __field(        short,  oom_score_adj)
        ),
 
        TP_fast_assign(
@@ -25,7 +25,7 @@ TRACE_EVENT(task_newtask,
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),
 
-       TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d",
+       TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd",
                __entry->pid, __entry->comm,
                __entry->clone_flags, __entry->oom_score_adj)
 );
@@ -40,7 +40,7 @@ TRACE_EVENT(task_rename,
                __field(        pid_t,  pid)
                __array(        char, oldcomm,  TASK_COMM_LEN)
                __array(        char, newcomm,  TASK_COMM_LEN)
-               __field(        int, oom_score_adj)
+               __field(        short,  oom_score_adj)
        ),
 
        TP_fast_assign(
@@ -50,7 +50,7 @@ TRACE_EVENT(task_rename,
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),
 
-       TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d",
+       TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd",
                __entry->pid, __entry->oldcomm,
                __entry->newcomm, __entry->oom_score_adj)
 );
index d030d2c..4164529 100644 (file)
 /* compatibility flags */
 #define MAP_FILE       0
 
+/*
+ * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all powers of two.
+ * When 0 use the default page size.
+ */
+#define MAP_HUGE_SHIFT 26
+#define MAP_HUGE_MASK  0x3f
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
index 32c8bd6..e9fe6fd 100644 (file)
@@ -13,6 +13,8 @@
 #define MAP_STACK      0x20000         /* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB    0x40000         /* create a huge page mapping */
 
+/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
+
 #define MCL_CURRENT    1               /* lock all current mappings */
 #define MCL_FUTURE     2               /* lock all future mappings */
 
index dff40c9..4fa6d8f 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -495,7 +495,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
                if (shmflg & SHM_NORESERVE)
                        acctflag = VM_NORESERVE;
                file = hugetlb_file_setup(name, 0, size, acctflag,
-                                       &shp->mlock_user, HUGETLB_SHMFS_INODE);
+                                 &shp->mlock_user, HUGETLB_SHMFS_INODE,
+                               (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
        } else {
                /*
                 * Do not allow no accounting for OVERCOMMIT_NEVER, even
index 402a54a..d327b87 100644 (file)
@@ -161,6 +161,6 @@ EXPORT_SYMBOL(free_cpumask_var);
  */
 void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 {
-       free_bootmem((unsigned long)mask, cpumask_size());
+       free_bootmem(__pa(mask), cpumask_size());
 }
 #endif
index a3f8ddd..e6651c5 100644 (file)
@@ -188,6 +188,21 @@ config SPLIT_PTLOCK_CPUS
        default "4"
 
 #
+# support for memory balloon compaction
+config BALLOON_COMPACTION
+       bool "Allow for balloon memory compaction/migration"
+       def_bool y
+       depends on COMPACTION && VIRTIO_BALLOON
+       help
+         Memory fragmentation introduced by ballooning might significantly
+         reduce the number of 2MB contiguous memory blocks that can be used
+         within a guest, thus imposing performance penalties associated with
+         the reduced number of transparent huge pages that could be used by
+         the guest workload. Allowing compaction & migration of memory pages
+         enlisted as being part of memory balloon devices avoids the
+         aforementioned scenario and helps improve memory defragmentation.
+
+#
 # support for memory compaction
 config COMPACTION
        bool "Allow for memory compaction"
index 6b025f8..3a46287 100644 (file)
@@ -16,7 +16,8 @@ obj-y                 := filemap.o mempool.o oom_kill.o fadvise.o \
                           readahead.o swap.o truncate.o vmscan.o shmem.o \
                           util.o mmzone.o vmstat.o backing-dev.o \
                           mm_init.o mmu_context.o percpu.o slab_common.o \
-                          compaction.o interval_tree.o $(mmu-y)
+                          compaction.o balloon_compaction.o \
+                          interval_tree.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644 (file)
index 0000000..07dbc8e
--- /dev/null
@@ -0,0 +1,302 @@
+/*
+ * mm/balloon_compaction.c
+ *
+ * Common interface for making balloon pages movable by compaction.
+ *
+ * Copyright (C) 2012, Red Hat, Inc.  Rafael Aquini <aquini@redhat.com>
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/balloon_compaction.h>
+
+/*
+ * balloon_devinfo_alloc - set up a new balloon device descriptor.
+ * @balloon_dev_descriptor: opaque pointer to the balloon device this
+ *                          struct balloon_dev_info will be servicing.
+ *
+ * Balloon drivers call this to obtain an initialized struct balloon_dev_info,
+ * which references the balloon device and keeps track of its page list.
+ *
+ * Returns the new descriptor (empty page list, no mapping, zero isolated
+ * pages) or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
+{
+       struct balloon_dev_info *info = kmalloc(sizeof(*info), GFP_KERNEL);
+
+       if (info == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       /* Page tracking starts out empty: no pages listed, none isolated. */
+       INIT_LIST_HEAD(&info->pages);
+       spin_lock_init(&info->pages_lock);
+       info->isolated_pages = 0;
+
+       /* No special page->mapping is attached at this point. */
+       info->mapping = NULL;
+       info->balloon_device = balloon_dev_descriptor;
+
+       return info;
+}
+EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
+
+/*
+ * balloon_page_enqueue - allocate a new page and insert it into the balloon
+ *                       page list.
+ * @b_dev_info: balloon device descriptor the new page is inserted into.
+ *
+ * Drivers must call this to allocate a page and enlist it as a balloon page
+ * before definitively removing it from the guest system.
+ *
+ * Returns the freshly enqueued page, or NULL when no page could be allocated
+ * on this attempt.
+ */
+struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
+{
+       const gfp_t gfp = balloon_mapping_gfp_mask() |
+                         __GFP_NOMEMALLOC | __GFP_NORETRY;
+       unsigned long irq_flags;
+       struct page *new_page;
+
+       new_page = alloc_page(gfp);
+       if (new_page == NULL)
+               return NULL;
+
+       /*
+        * Nobody else should hold a reference to the new page yet, so its
+        * lock is expected to be free. Holding it blocks others from
+        * accessing the page while additional references are established.
+        */
+       BUG_ON(!trylock_page(new_page));
+
+       spin_lock_irqsave(&b_dev_info->pages_lock, irq_flags);
+       balloon_page_insert(new_page, b_dev_info->mapping, &b_dev_info->pages);
+       spin_unlock_irqrestore(&b_dev_info->pages_lock, irq_flags);
+
+       unlock_page(new_page);
+       return new_page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_enqueue);
+
+/*
+ * balloon_page_dequeue - remove a page from the balloon's page list and
+ *                       return it so the driver can release the page.
+ * @b_dev_info: balloon device descriptor to grab a page from.
+ *
+ * Drivers must call this to de-allocate a previously enlisted balloon page
+ * before definitively releasing it back to the guest system.
+ *
+ * The page list is walked with pages_lock held, because compaction unlinks
+ * and re-links list entries under that same lock (__isolate_balloon_page()
+ * and __putback_balloon_page()); a lockless walk could step onto an entry
+ * that is concurrently being removed.
+ *
+ * Returns the dequeued page, which carries an extra reference, or NULL when
+ * no page could be taken on this attempt, e.g. because the page list is
+ * temporarily empty due to compaction isolated pages.
+ */
+struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
+{
+       struct page *page, *tmp;
+       unsigned long flags;
+       bool dequeued_page;
+
+       dequeued_page = false;
+       spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+       list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
+               /*
+                * Block others from accessing the 'page' while we get around
+                * establishing additional references and preparing the 'page'
+                * to be released by the balloon driver. trylock_page() does
+                * not sleep, so it may be used with pages_lock held.
+                */
+               if (trylock_page(page)) {
+                       /*
+                        * Raise the page refcount here to prevent any wrong
+                        * attempt to isolate this page, in case of colliding
+                        * with balloon_page_isolate() just after we release
+                        * the page lock.
+                        *
+                        * balloon_page_free() will take care of dropping
+                        * this extra refcount later.
+                        */
+                       get_page(page);
+                       balloon_page_delete(page);
+                       dequeued_page = true;
+                       break;
+               }
+       }
+
+       /*
+        * If we are unable to dequeue a balloon page because the page
+        * list is empty and there are no isolated pages, then something
+        * went out of track and some balloon pages are lost.
+        * BUG() here, otherwise the balloon driver may get stuck in
+        * an infinite loop while attempting to release all its pages.
+        */
+       if (unlikely(!dequeued_page && list_empty(&b_dev_info->pages) &&
+                    !b_dev_info->isolated_pages))
+               BUG();
+       spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+
+       if (!dequeued_page)
+               return NULL;
+
+       /* The page lock is dropped only after pages_lock has been released. */
+       unlock_page(page);
+       return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_dequeue);
+
+#ifdef CONFIG_BALLOON_COMPACTION
+/*
+ * balloon_mapping_alloc - build the special ->mapping used by balloon pages.
+ * @b_dev_info: balloon device information descriptor.
+ * @a_ops: address_space_operations descriptor to install on the new mapping.
+ *
+ * Drivers must call this to allocate and initialize the struct address_space
+ * that serves as the special page->mapping of pages enlisted in the balloon
+ * device.
+ *
+ * Returns the new mapping, which is also recorded in @b_dev_info->mapping,
+ * or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
+                               const struct address_space_operations *a_ops)
+{
+       struct address_space *aspace = kmalloc(sizeof(*aspace), GFP_KERNEL);
+
+       if (aspace == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       /* Give every element of the new address_space a clean status. */
+       address_space_init_once(aspace);
+
+       /* Tag ->flags so that balloon page mappings can be identified. */
+       mapping_set_balloon(aspace);
+       mapping_set_gfp_mask(aspace, balloon_mapping_gfp_mask());
+
+       /* Callbacks reached through page->mapping->a_ops. */
+       aspace->a_ops = a_ops;
+
+       /*
+        * Link the mapping back to the balloon device descriptor it is
+        * servicing. Compaction / migration procedures use this reference
+        * to identify and access the balloon device pageset while isolating
+        * or migrating pages.
+        *
+        * Since a balloon driver may register multiple balloon devices for
+        * a single guest, this also lets compaction / migration deal with
+        * multiple balloon pagesets when required.
+        */
+       aspace->private_data = b_dev_info;
+       b_dev_info->mapping = aspace;
+
+       return aspace;
+}
+EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
+
+/*
+ * Unlink @page from the page list of its balloon device and account it as
+ * isolated, all under the device's pages_lock.
+ */
+static inline void __isolate_balloon_page(struct page *page)
+{
+       struct balloon_dev_info *dev_info = page->mapping->private_data;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&dev_info->pages_lock, irq_flags);
+       dev_info->isolated_pages++;
+       list_del(&page->lru);
+       spin_unlock_irqrestore(&dev_info->pages_lock, irq_flags);
+}
+
+/*
+ * Link @page back into the page list of its balloon device and drop it from
+ * the isolated pages count, all under the device's pages_lock.
+ */
+static inline void __putback_balloon_page(struct page *page)
+{
+       struct balloon_dev_info *dev_info = page->mapping->private_data;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&dev_info->pages_lock, irq_flags);
+       dev_info->isolated_pages--;
+       list_add(&page->lru, &dev_info->pages);
+       spin_unlock_irqrestore(&dev_info->pages_lock, irq_flags);
+}
+
+/* Hand the migration to the ->migratepage() callback of @page's mapping. */
+static inline int __migrate_balloon_page(struct address_space *mapping,
+               struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+       const struct address_space_operations *ops = page->mapping->a_ops;
+
+       return ops->migratepage(mapping, newpage, page, mode);
+}
+
+/*
+ * __isolate_lru_page() counterpart for a ballooned page.
+ *
+ * Returns true when @page was isolated: it has then been taken off its
+ * balloon device page list and the reference raised here is kept. Returns
+ * false otherwise, with that reference (when it was obtained) dropped again.
+ */
+bool balloon_page_isolate(struct page *page)
+{
+       /*
+        * Avoid burning cycles with pages that are still under __free_pages(),
+        * or just got freed under us.
+        *
+        * In case we 'win' a race for a balloon page being freed under us and
+        * raise its refcount preventing __free_pages() from doing its job
+        * the put_page() at the end of this block will take care of
+        * releasing this page, thus avoiding a nasty leakage.
+        */
+       if (likely(get_page_unless_zero(page))) {
+               /*
+                * As balloon pages are not isolated from LRU lists, concurrent
+                * compaction threads can race against page migration functions
+                * as well as race against the balloon driver releasing a page.
+                *
+                * In order to avoid having an already isolated balloon page
+                * being (wrongly) re-isolated while it is under migration,
+                * or to avoid attempting to isolate pages being released by
+                * the balloon driver, let's be sure we have the page lock
+                * before proceeding with the balloon page isolation steps.
+                */
+               if (likely(trylock_page(page))) {
+                       /*
+                        * A ballooned page, by default, has just one refcount;
+                        * with the one raised above that makes two.
+                        * Prevent concurrent compaction threads from isolating
+                        * an already isolated balloon page by refcount check.
+                        */
+                       if (__is_movable_balloon_page(page) &&
+                           page_count(page) == 2) {
+                               __isolate_balloon_page(page);
+                               unlock_page(page);
+                               return true;
+                       }
+                       unlock_page(page);
+               }
+               put_page(page);
+       }
+       return false;
+}
+
+/*
+ * putback_lru_page() counterpart for a ballooned page: link an isolated
+ * balloon page back into its device page list and drop the reference that
+ * was taken for the isolation.
+ */
+void balloon_page_putback(struct page *page)
+{
+       /*
+        * Holding the page lock stabilizes the page and prevents races
+        * against concurrent isolation threads attempting to re-isolate it.
+        */
+       lock_page(page);
+
+       if (!__is_movable_balloon_page(page)) {
+               WARN_ON(1);
+               dump_page(page);
+       } else {
+               __putback_balloon_page(page);
+               /* release the extra reference held since page isolation */
+               put_page(page);
+       }
+
+       unlock_page(page);
+}
+
+/*
+ * move_to_new_page() counterpart for a ballooned page.
+ *
+ * Returns the result of the ->migratepage() callback of @page's mapping, or
+ * -EAGAIN when @page is not a movable balloon page or has no mapping.
+ */
+int balloon_page_migrate(struct page *newpage,
+                        struct page *page, enum migrate_mode mode)
+{
+       struct address_space *balloon_as;
+       int ret = -EAGAIN;
+
+       /*
+        * Nobody else should hold a reference to 'newpage' at this point, so
+        * its lock is expected to be free. Holding it blocks others from
+        * accessing 'newpage' while additional references are established.
+        */
+       BUG_ON(!trylock_page(newpage));
+
+       if (WARN_ON(!__is_movable_balloon_page(page))) {
+               dump_page(page);
+               goto out_unlock;
+       }
+
+       balloon_as = page->mapping;
+       if (balloon_as != NULL)
+               ret = __migrate_balloon_page(balloon_as, newpage, page, mode);
+
+out_unlock:
+       unlock_page(newpage);
+       return ret;
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
index f468185..ecc4595 100644 (file)
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
- * @addr: starting address of the range
+ * @physaddr: starting physical address of the range
  * @size: size of the range in bytes
  *
  * This is only useful when the bootmem allocator has already been torn
  * down, but we are still initializing the system.  Pages are given directly
  * to the page allocator, no bootmem metadata is updated because it is gone.
  */
-void __init free_bootmem_late(unsigned long addr, unsigned long size)
+void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
 {
        unsigned long cursor, end;
 
-       kmemleak_free_part(__va(addr), size);
+       kmemleak_free_part(__va(physaddr), size);
 
-       cursor = PFN_UP(addr);
-       end = PFN_DOWN(addr + size);
+       cursor = PFN_UP(physaddr);
+       end = PFN_DOWN(physaddr + size);
 
        for (; cursor < end; cursor++) {
                __free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -377,21 +377,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 
 /**
  * free_bootmem - mark a page range as usable
- * @addr: starting address of the range
+ * @physaddr: starting physical address of the range
  * @size: size of the range in bytes
  *
  * Partial pages will be considered reserved and left as they are.
  *
  * The range must be contiguous but may span node boundaries.
  */
-void __init free_bootmem(unsigned long addr, unsigned long size)
+void __init free_bootmem(unsigned long physaddr, unsigned long size)
 {
        unsigned long start, end;
 
-       kmemleak_free_part(__va(addr), size);
+       kmemleak_free_part(__va(physaddr), size);
 
-       start = PFN_UP(addr);
-       end = PFN_DOWN(addr + size);
+       start = PFN_UP(physaddr);
+       end = PFN_DOWN(physaddr + size);
 
        mark_bootmem(start, end, 0, 0);
 }
index 694eaab..d24dd2d 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/backing-dev.h>
 #include <linux/sysctl.h>
 #include <linux/sysfs.h>
+#include <linux/balloon_compaction.h>
 #include "internal.h"
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -565,9 +566,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
                        goto next_pageblock;
                }
 
-               /* Check may be lockless but that's ok as we recheck later */
-               if (!PageLRU(page))
+               /*
+                * Check may be lockless but that's ok as we recheck later.
+                * It's possible to migrate LRU pages and balloon pages
+                * Skip any other type of page
+                */
+               if (!PageLRU(page)) {
+                       if (unlikely(balloon_page_movable(page))) {
+                               if (locked && balloon_page_isolate(page)) {
+                                       /* Successfully isolated */
+                                       cc->finished_update_migrate = true;
+                                       list_add(&page->lru, migratelist);
+                                       cc->nr_migratepages++;
+                                       nr_isolated++;
+                                       goto check_compact_cluster;
+                               }
+                       }
                        continue;
+               }
 
                /*
                 * PageLRU is set. lru_lock normally excludes isolation
@@ -621,6 +637,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
                cc->nr_migratepages++;
                nr_isolated++;
 
+check_compact_cluster:
                /* Avoid isolating too much */
                if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
                        ++low_pfn;
@@ -986,7 +1003,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                switch (isolate_migratepages(zone, cc)) {
                case ISOLATE_ABORT:
                        ret = COMPACT_PARTIAL;
-                       putback_lru_pages(&cc->migratepages);
+                       putback_movable_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
                        goto out;
                case ISOLATE_NONE:
@@ -1009,9 +1026,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
                                                nr_remaining);
 
-               /* Release LRU pages not migrated */
+               /* Release isolated pages not migrated */
                if (err) {
-                       putback_lru_pages(&cc->migratepages);
+                       putback_movable_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
                        if (err == -ENOMEM) {
                                ret = COMPACT_PARTIAL;
index da1b0f0..c69781e 100644 (file)
@@ -332,6 +332,30 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
        retval = offset + page->vaddr;
        *handle = offset + page->dma;
 #ifdef DMAPOOL_DEBUG
+       {
+               int i;
+               u8 *data = retval;
+               /* page->offset is stored in first 4 bytes */
+               for (i = sizeof(page->offset); i < pool->size; i++) {
+                       if (data[i] == POOL_POISON_FREED)
+                               continue;
+                       if (pool->dev)
+                               dev_err(pool->dev,
+                                       "dma_pool_alloc %s, %p (corruped)\n",
+                                       pool->name, retval);
+                       else
+                               pr_err("dma_pool_alloc %s, %p (corruped)\n",
+                                       pool->name, retval);
+
+                       /*
+                        * Dump the first 4 bytes even if they are not
+                        * POOL_POISON_FREED
+                        */
+                       print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
+                                       data, pool->size, 1);
+                       break;
+               }
+       }
        memset(retval, POOL_POISON_ALLOCATED, pool->size);
 #endif
        spin_unlock_irqrestore(&pool->lock, flags);
index 2da13a5..d999077 100644 (file)
@@ -99,7 +99,7 @@ struct page *kmap_to_page(void *vaddr)
        unsigned long addr = (unsigned long)vaddr;
 
        if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
-               int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
+               int i = PKMAP_NR(addr);
                return pte_page(pkmap_page_table[i]);
        }
 
@@ -137,8 +137,7 @@ static void flush_all_zero_pkmaps(void)
                 * So no dangers, even with speculative execution.
                 */
                page = pte_page(pkmap_page_table[i]);
-               pte_clear(&init_mm, (unsigned long)page_address(page),
-                         &pkmap_page_table[i]);
+               pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
 
                set_page_address(page, NULL);
                need_flush = 1;
@@ -324,11 +323,7 @@ struct page_address_map {
        struct list_head list;
 };
 
-/*
- * page_address_map freelist, allocated from page_address_maps.
- */
-static struct list_head page_address_pool;     /* freelist */
-static spinlock_t pool_lock;                   /* protects page_address_pool */
+static struct page_address_map page_address_maps[LAST_PKMAP];
 
 /*
  * Hash table bucket
@@ -393,14 +388,7 @@ void set_page_address(struct page *page, void *virtual)
 
        pas = page_slot(page);
        if (virtual) {          /* Add */
-               BUG_ON(list_empty(&page_address_pool));
-
-               spin_lock_irqsave(&pool_lock, flags);
-               pam = list_entry(page_address_pool.next,
-                               struct page_address_map, list);
-               list_del(&pam->list);
-               spin_unlock_irqrestore(&pool_lock, flags);
-
+               pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
                pam->page = page;
                pam->virtual = virtual;
 
@@ -413,9 +401,6 @@ void set_page_address(struct page *page, void *virtual)
                        if (pam->page == page) {
                                list_del(&pam->list);
                                spin_unlock_irqrestore(&pas->lock, flags);
-                               spin_lock_irqsave(&pool_lock, flags);
-                               list_add_tail(&pam->list, &page_address_pool);
-                               spin_unlock_irqrestore(&pool_lock, flags);
                                goto done;
                        }
                }
@@ -425,20 +410,14 @@ done:
        return;
 }
 
-static struct page_address_map page_address_maps[LAST_PKMAP];
-
 void __init page_address_init(void)
 {
        int i;
 
-       INIT_LIST_HEAD(&page_address_pool);
-       for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
-               list_add(&page_address_maps[i].list, &page_address_pool);
        for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
                INIT_LIST_HEAD(&page_address_htable[i].lh);
                spin_lock_init(&page_address_htable[i].lock);
        }
-       spin_lock_init(&pool_lock);
 }
 
 #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
index 40f17c3..5f902e2 100644 (file)
@@ -606,6 +606,15 @@ static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
        return pmd;
 }
 
+static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+{
+       pmd_t entry;
+       entry = mk_pmd(page, vma->vm_page_prot);
+       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+       entry = pmd_mkhuge(entry);
+       return entry;
+}
+
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long haddr, pmd_t *pmd,
@@ -629,9 +638,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                pte_free(mm, pgtable);
        } else {
                pmd_t entry;
-               entry = mk_pmd(page, vma->vm_page_prot);
-               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               entry = pmd_mkhuge(entry);
+               entry = mk_huge_pmd(page, vma);
                /*
                 * The spinlocking to take the lru_lock inside
                 * page_add_new_anon_rmap() acts as a full memory
@@ -777,6 +784,28 @@ out:
        return ret;
 }
 
+void huge_pmd_set_accessed(struct mm_struct *mm,
+                          struct vm_area_struct *vma,
+                          unsigned long address,
+                          pmd_t *pmd, pmd_t orig_pmd,
+                          int dirty)
+{
+       pmd_t entry;
+       unsigned long haddr;
+
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(*pmd, orig_pmd)))
+               goto unlock;
+
+       entry = pmd_mkyoung(orig_pmd);
+       haddr = address & HPAGE_PMD_MASK;
+       if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
+               update_mmu_cache_pmd(vma, address, pmd);
+
+unlock:
+       spin_unlock(&mm->page_table_lock);
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address,
@@ -951,9 +980,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        } else {
                pmd_t entry;
                VM_BUG_ON(!PageHead(page));
-               entry = mk_pmd(new_page, vma->vm_page_prot);
-               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               entry = pmd_mkhuge(entry);
+               entry = mk_huge_pmd(new_page, vma);
                pmdp_clear_flush(vma, haddr, pmd);
                page_add_new_anon_rmap(new_page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
@@ -1146,22 +1173,14 @@ pmd_t *page_check_address_pmd(struct page *page,
                              unsigned long address,
                              enum page_check_address_pmd_flag flag)
 {
-       pgd_t *pgd;
-       pud_t *pud;
        pmd_t *pmd, *ret = NULL;
 
        if (address & ~HPAGE_PMD_MASK)
                goto out;
 
-       pgd = pgd_offset(mm, address);
-       if (!pgd_present(*pgd))
-               goto out;
-
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
+       pmd = mm_find_pmd(mm, address);
+       if (!pmd)
                goto out;
-
-       pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                goto out;
        if (pmd_page(*pmd) != page)
@@ -1701,64 +1720,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
        }
 }
 
-static void release_all_pte_pages(pte_t *pte)
-{
-       release_pte_pages(pte, pte + HPAGE_PMD_NR);
-}
-
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pte_t *pte)
 {
        struct page *page;
        pte_t *_pte;
-       int referenced = 0, isolated = 0, none = 0;
+       int referenced = 0, none = 0;
        for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval)) {
                        if (++none <= khugepaged_max_ptes_none)
                                continue;
-                       else {
-                               release_pte_pages(pte, _pte);
+                       else
                                goto out;
-                       }
                }
-               if (!pte_present(pteval) || !pte_write(pteval)) {
-                       release_pte_pages(pte, _pte);
+               if (!pte_present(pteval) || !pte_write(pteval))
                        goto out;
-               }
                page = vm_normal_page(vma, address, pteval);
-               if (unlikely(!page)) {
-                       release_pte_pages(pte, _pte);
+               if (unlikely(!page))
                        goto out;
-               }
+
                VM_BUG_ON(PageCompound(page));
                BUG_ON(!PageAnon(page));
                VM_BUG_ON(!PageSwapBacked(page));
 
                /* cannot use mapcount: can't collapse if there's a gup pin */
-               if (page_count(page) != 1) {
-                       release_pte_pages(pte, _pte);
+               if (page_count(page) != 1)
                        goto out;
-               }
                /*
                 * We can do it before isolate_lru_page because the
                 * page can't be freed from under us. NOTE: PG_lock
                 * is needed to serialize against split_huge_page
                 * when invoked from the VM.
                 */
-               if (!trylock_page(page)) {
-                       release_pte_pages(pte, _pte);
+               if (!trylock_page(page))
                        goto out;
-               }
                /*
                 * Isolate the page to avoid collapsing an hugepage
                 * currently in use by the VM.
                 */
                if (isolate_lru_page(page)) {
                        unlock_page(page);
-                       release_pte_pages(pte, _pte);
                        goto out;
                }
                /* 0 stands for page_is_file_cache(page) == false */
@@ -1771,12 +1775,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = 1;
        }
-       if (unlikely(!referenced))
-               release_all_pte_pages(pte);
-       else
-               isolated = 1;
+       if (likely(referenced))
+               return 1;
 out:
-       return isolated;
+       release_pte_pages(pte, _pte);
+       return 0;
 }
 
 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1918,14 +1921,26 @@ static struct page
 }
 #endif
 
+static bool hugepage_vma_check(struct vm_area_struct *vma)
+{
+       if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+           (vma->vm_flags & VM_NOHUGEPAGE))
+               return false;
+
+       if (!vma->anon_vma || vma->vm_ops)
+               return false;
+       if (is_vma_temporary_stack(vma))
+               return false;
+       VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+       return true;
+}
+
 static void collapse_huge_page(struct mm_struct *mm,
                                   unsigned long address,
                                   struct page **hpage,
                                   struct vm_area_struct *vma,
                                   int node)
 {
-       pgd_t *pgd;
-       pud_t *pud;
        pmd_t *pmd, _pmd;
        pte_t *pte;
        pgtable_t pgtable;
@@ -1960,28 +1975,12 @@ static void collapse_huge_page(struct mm_struct *mm,
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
                goto out;
-
-       if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
-           (vma->vm_flags & VM_NOHUGEPAGE))
-               goto out;
-
-       if (!vma->anon_vma || vma->vm_ops)
-               goto out;
-       if (is_vma_temporary_stack(vma))
+       if (!hugepage_vma_check(vma))
                goto out;
-       VM_BUG_ON(vma->vm_flags & VM_NO_THP);
-
-       pgd = pgd_offset(mm, address);
-       if (!pgd_present(*pgd))
+       pmd = mm_find_pmd(mm, address);
+       if (!pmd)
                goto out;
-
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
-               goto out;
-
-       pmd = pmd_offset(pud, address);
-       /* pmd can't go away or become huge under us */
-       if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+       if (pmd_trans_huge(*pmd))
                goto out;
 
        anon_vma_lock(vma->anon_vma);
@@ -2028,9 +2027,7 @@ static void collapse_huge_page(struct mm_struct *mm,
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
 
-       _pmd = mk_pmd(new_page, vma->vm_page_prot);
-       _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-       _pmd = pmd_mkhuge(_pmd);
+       _pmd = mk_huge_pmd(new_page, vma);
 
        /*
         * spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2064,8 +2061,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                               unsigned long address,
                               struct page **hpage)
 {
-       pgd_t *pgd;
-       pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, *_pte;
        int ret = 0, referenced = 0, none = 0;
@@ -2076,16 +2071,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
-       pgd = pgd_offset(mm, address);
-       if (!pgd_present(*pgd))
-               goto out;
-
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
+       pmd = mm_find_pmd(mm, address);
+       if (!pmd)
                goto out;
-
-       pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+       if (pmd_trans_huge(*pmd))
                goto out;
 
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2193,20 +2182,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                        progress++;
                        break;
                }
-
-               if ((!(vma->vm_flags & VM_HUGEPAGE) &&
-                    !khugepaged_always()) ||
-                   (vma->vm_flags & VM_NOHUGEPAGE)) {
-               skip:
+               if (!hugepage_vma_check(vma)) {
+skip:
                        progress++;
                        continue;
                }
-               if (!vma->anon_vma || vma->vm_ops)
-                       goto skip;
-               if (is_vma_temporary_stack(vma))
-                       goto skip;
-               VM_BUG_ON(vma->vm_flags & VM_NO_THP);
-
                hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
                hend = vma->vm_end & HPAGE_PMD_MASK;
                if (hstart >= hend)
@@ -2379,22 +2359,12 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
 static void split_huge_page_address(struct mm_struct *mm,
                                    unsigned long address)
 {
-       pgd_t *pgd;
-       pud_t *pud;
        pmd_t *pmd;
 
        VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
 
-       pgd = pgd_offset(mm, address);
-       if (!pgd_present(*pgd))
-               return;
-
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
-               return;
-
-       pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd))
+       pmd = mm_find_pmd(mm, address);
+       if (!pmd)
                return;
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
index 59a0059..1ef2cd4 100644 (file)
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
         * remove hstate attributes from any nodes that have them.
         */
        for (nid = 0; nid < nr_node_ids; nid++)
-               hugetlb_unregister_node(&node_devices[nid]);
+               hugetlb_unregister_node(node_devices[nid]);
 }
 
 /*
@@ -1845,7 +1845,7 @@ static void hugetlb_register_all_nodes(void)
        int nid;
 
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               struct node *node = &node_devices[nid];
+               struct node *node = node_devices[nid];
                if (node->dev.id == nid)
                        hugetlb_register_node(node);
        }
index a4fa284..52d1fa9 100644 (file)
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
 
 /*
+ * in mm/rmap.c:
+ */
+extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+
+/*
  * in mm/page_alloc.c
  */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
index ae539f0..382d930 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
                        struct page *kpage, pte_t orig_pte)
 {
        struct mm_struct *mm = vma->vm_mm;
-       pgd_t *pgd;
-       pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep;
        spinlock_t *ptl;
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
        if (addr == -EFAULT)
                goto out;
 
-       pgd = pgd_offset(mm, addr);
-       if (!pgd_present(*pgd))
+       pmd = mm_find_pmd(mm, addr);
+       if (!pmd)
                goto out;
-
-       pud = pud_offset(pgd, addr);
-       if (!pud_present(*pud))
-               goto out;
-
-       pmd = pmd_offset(pud, addr);
        BUG_ON(pmd_trans_huge(*pmd));
-       if (!pmd_present(*pmd))
-               goto out;
 
        mmun_start = addr;
        mmun_end   = addr + PAGE_SIZE;
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
        if (ksm_run != flags) {
                ksm_run = flags;
                if (flags & KSM_RUN_UNMERGE) {
-                       int oom_score_adj;
-
-                       oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
+                       set_current_oom_origin();
                        err = unmerge_and_remove_all_rmap_items();
-                       compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
-                                                               oom_score_adj);
+                       clear_current_oom_origin();
                        if (err) {
                                ksm_run = KSM_RUN_STOP;
                                count = err;
index dd39ba0..cf6d0df 100644 (file)
@@ -1498,8 +1498,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
        return limit;
 }
 
-void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
-                             int order)
+static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+                                    int order)
 {
        struct mem_cgroup *iter;
        unsigned long chosen_points = 0;
index 8b20278..108c52f 100644 (file)
@@ -781,16 +781,16 @@ static struct page_state {
        { compound,     compound,       "huge",         me_huge_page },
 #endif
 
-       { sc|dirty,     sc|dirty,       "swapcache",    me_swapcache_dirty },
-       { sc|dirty,     sc,             "swapcache",    me_swapcache_clean },
+       { sc|dirty,     sc|dirty,       "dirty swapcache",      me_swapcache_dirty },
+       { sc|dirty,     sc,             "clean swapcache",      me_swapcache_clean },
 
-       { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
-       { unevict,      unevict,        "unevictable LRU", me_pagecache_clean},
+       { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
+       { unevict,      unevict,        "clean unevictable LRU", me_pagecache_clean },
 
-       { mlock|dirty,  mlock|dirty,    "mlocked LRU",  me_pagecache_dirty },
-       { mlock,        mlock,          "mlocked LRU",  me_pagecache_clean },
+       { mlock|dirty,  mlock|dirty,    "dirty mlocked LRU",    me_pagecache_dirty },
+       { mlock,        mlock,          "clean mlocked LRU",    me_pagecache_clean },
 
-       { lru|dirty,    lru|dirty,      "LRU",          me_pagecache_dirty },
+       { lru|dirty,    lru|dirty,      "dirty LRU",    me_pagecache_dirty },
        { lru|dirty,    lru,            "clean LRU",    me_pagecache_clean },
 
        /*
@@ -812,14 +812,14 @@ static struct page_state {
 #undef slab
 #undef reserved
 
+/*
+ * "Dirty/Clean" indication is not 100% accurate due to the possibility of
+ * setting PG_dirty outside page lock. See also comment above set_page_dirty().
+ */
 static void action_result(unsigned long pfn, char *msg, int result)
 {
-       struct page *page = pfn_to_page(pfn);
-
-       printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
-               pfn,
-               PageDirty(page) ? "dirty " : "",
-               msg, action_name[result]);
+       pr_err("MCE %#lx: %s page recovery: %s\n",
+               pfn, msg, action_name[result]);
 }
 
 static int page_action(struct page_state *ps, struct page *p,
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
         * Isolate the page, so that it doesn't get reallocated if it
         * was free.
         */
-       set_migratetype_isolate(p);
+       set_migratetype_isolate(p, true);
        /*
         * When the target page is a free hugepage, just remove it
         * from free hugepage list.
index 221fc9f..7653773 100644 (file)
@@ -3537,8 +3537,9 @@ retry:
 
                barrier();
                if (pmd_trans_huge(orig_pmd)) {
-                       if (flags & FAULT_FLAG_WRITE &&
-                           !pmd_write(orig_pmd) &&
+                       unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+                       if (dirty && !pmd_write(orig_pmd) &&
                            !pmd_trans_splitting(orig_pmd)) {
                                ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                                                          orig_pmd);
@@ -3550,6 +3551,9 @@ retry:
                                if (unlikely(ret & VM_FAULT_OOM))
                                        goto retry;
                                return ret;
+                       } else {
+                               huge_pmd_set_accessed(mm, vma, address, pmd,
+                                                     orig_pmd, dirty);
                        }
                        return 0;
                }
index e4eeaca..de9cb14 100644 (file)
@@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
        zone_span_writelock(zone);
 
        old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
-       if (start_pfn < zone->zone_start_pfn)
+       if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
                zone->zone_start_pfn = start_pfn;
 
        zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +214,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
        zone_span_writeunlock(zone);
 }
 
+static void resize_zone(struct zone *zone, unsigned long start_pfn,
+               unsigned long end_pfn)
+{
+       zone_span_writelock(zone);
+
+       if (end_pfn - start_pfn) {
+               zone->zone_start_pfn = start_pfn;
+               zone->spanned_pages = end_pfn - start_pfn;
+       } else {
+               /*
+                * keep this consistent with free_area_init_core():
+                * if spanned_pages is 0, then keep zone_start_pfn at 0 too
+                */
+               zone->zone_start_pfn = 0;
+               zone->spanned_pages = 0;
+       }
+
+       zone_span_writeunlock(zone);
+}
+
+static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
+               unsigned long end_pfn)
+{
+       enum zone_type zid = zone_idx(zone);
+       int nid = zone->zone_pgdat->node_id;
+       unsigned long pfn;
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn++)
+               set_page_links(pfn_to_page(pfn), zid, nid, pfn);
+}
+
+static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
+               unsigned long start_pfn, unsigned long end_pfn)
+{
+       int ret;
+       unsigned long flags;
+       unsigned long z1_start_pfn;
+
+       if (!z1->wait_table) {
+               ret = init_currently_empty_zone(z1, start_pfn,
+                       end_pfn - start_pfn, MEMMAP_HOTPLUG);
+               if (ret)
+                       return ret;
+       }
+
+       pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+       /* can't move pfns which are higher than @z2 */
+       if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
+               goto out_fail;
+       /* the moved-out part must be at the leftmost end of @z2 */
+       if (start_pfn > z2->zone_start_pfn)
+               goto out_fail;
+       /* the range must be included in or overlap @z2 */
+       if (end_pfn <= z2->zone_start_pfn)
+               goto out_fail;
+
+       /* use start_pfn for z1's start_pfn if z1 is empty */
+       if (z1->spanned_pages)
+               z1_start_pfn = z1->zone_start_pfn;
+       else
+               z1_start_pfn = start_pfn;
+
+       resize_zone(z1, z1_start_pfn, end_pfn);
+       resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
+
+       pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+       fix_zone_id(z1, start_pfn, end_pfn);
+
+       return 0;
+out_fail:
+       pgdat_resize_unlock(z1->zone_pgdat, &flags);
+       return -1;
+}
+
+static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
+               unsigned long start_pfn, unsigned long end_pfn)
+{
+       int ret;
+       unsigned long flags;
+       unsigned long z2_end_pfn;
+
+       if (!z2->wait_table) {
+               ret = init_currently_empty_zone(z2, start_pfn,
+                       end_pfn - start_pfn, MEMMAP_HOTPLUG);
+               if (ret)
+                       return ret;
+       }
+
+       pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+       /* can't move pfns which are lower than @z1 */
+       if (z1->zone_start_pfn > start_pfn)
+               goto out_fail;
+       /* the moved-out part must be at the rightmost end of @z1 */
+       if (z1->zone_start_pfn + z1->spanned_pages >  end_pfn)
+               goto out_fail;
+       /* the range must be included in or overlap @z1 */
+       if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
+               goto out_fail;
+
+       /* use end_pfn for z2's end_pfn if z2 is empty */
+       if (z2->spanned_pages)
+               z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
+       else
+               z2_end_pfn = end_pfn;
+
+       resize_zone(z1, z1->zone_start_pfn, start_pfn);
+       resize_zone(z2, start_pfn, z2_end_pfn);
+
+       pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+       fix_zone_id(z2, start_pfn, end_pfn);
+
+       return 0;
+out_fail:
+       pgdat_resize_unlock(z1->zone_pgdat, &flags);
+       return -1;
+}
+
 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
                            unsigned long end_pfn)
 {
        unsigned long old_pgdat_end_pfn =
                pgdat->node_start_pfn + pgdat->node_spanned_pages;
 
-       if (start_pfn < pgdat->node_start_pfn)
+       if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
                pgdat->node_start_pfn = start_pfn;
 
        pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -460,8 +581,61 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
        return 0;
 }
 
+/* ensure every online node has NORMAL memory */
+static bool can_online_high_movable(struct zone *zone)
+{
+       return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+}
+
+/* check which state of node_states will be changed when online memory */
+static void node_states_check_changes_online(unsigned long nr_pages,
+       struct zone *zone, struct memory_notify *arg)
+{
+       int nid = zone_to_nid(zone);
+       enum zone_type zone_last = ZONE_NORMAL;
+
+       /*
+        * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
+        * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
+        *
+        * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
+        * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+        */
+       if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
+               zone_last = ZONE_MOVABLE;
+
+       /*
+        * if the memory to be online is in a zone of 0...zone_last, and
+        * the zones of 0...zone_last don't have memory before online, we will
+        * need to set the node to node_states[N_NORMAL_MEMORY] after
+        * the memory is online.
+        */
+       if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
+               arg->status_change_nid_normal = nid;
+       else
+               arg->status_change_nid_normal = -1;
+
+       /*
+        * if the node doesn't have memory before online, we will need to
+        * set the node to node_states[N_HIGH_MEMORY] after the memory
+        * is online.
+        */
+       if (!node_state(nid, N_HIGH_MEMORY))
+               arg->status_change_nid = nid;
+       else
+               arg->status_change_nid = -1;
+}
+
+static void node_states_set_node(int node, struct memory_notify *arg)
+{
+       if (arg->status_change_nid_normal >= 0)
+               node_set_state(node, N_NORMAL_MEMORY);
+
+       node_set_state(node, N_HIGH_MEMORY);
+}
 
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
 {
        unsigned long onlined_pages = 0;
        struct zone *zone;
@@ -471,13 +645,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
        struct memory_notify arg;
 
        lock_memory_hotplug();
+       /*
+        * This doesn't need a lock to do pfn_to_page().
+        * The section can't be removed here because of the
+        * memory_block->state_mutex.
+        */
+       zone = page_zone(pfn_to_page(pfn));
+
+       if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
+           !can_online_high_movable(zone)) {
+               unlock_memory_hotplug();
+               return -1;
+       }
+
+       if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
+               if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
+                       unlock_memory_hotplug();
+                       return -1;
+               }
+       }
+       if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
+               if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
+                       unlock_memory_hotplug();
+                       return -1;
+               }
+       }
+
+       /* The code above may have changed the zone of the pfn range */
+       zone = page_zone(pfn_to_page(pfn));
+
        arg.start_pfn = pfn;
        arg.nr_pages = nr_pages;
-       arg.status_change_nid = -1;
+       node_states_check_changes_online(nr_pages, zone, &arg);
 
        nid = page_to_nid(pfn_to_page(pfn));
-       if (node_present_pages(nid) == 0)
-               arg.status_change_nid = nid;
 
        ret = memory_notify(MEM_GOING_ONLINE, &arg);
        ret = notifier_to_errno(ret);
@@ -487,23 +688,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
                return ret;
        }
        /*
-        * This doesn't need a lock to do pfn_to_page().
-        * The section can't be removed here because of the
-        * memory_block->state_mutex.
-        */
-       zone = page_zone(pfn_to_page(pfn));
-       /*
         * If this zone is not populated, then it is not in zonelist.
         * This means the page allocator ignores this zone.
         * So, zonelist must be updated after online.
         */
        mutex_lock(&zonelists_mutex);
-       if (!populated_zone(zone))
+       if (!populated_zone(zone)) {
                need_zonelists_rebuild = 1;
+               build_all_zonelists(NULL, zone);
+       }
 
        ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                online_pages_range);
        if (ret) {
+               if (need_zonelists_rebuild)
+                       zone_pcp_reset(zone);
                mutex_unlock(&zonelists_mutex);
                printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
                       (unsigned long long) pfn << PAGE_SHIFT,
@@ -517,9 +716,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
        zone->present_pages += onlined_pages;
        zone->zone_pgdat->node_present_pages += onlined_pages;
        if (onlined_pages) {
-               node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+               node_states_set_node(zone_to_nid(zone), &arg);
                if (need_zonelists_rebuild)
-                       build_all_zonelists(NULL, zone);
+                       build_all_zonelists(NULL, NULL);
                else
                        zone_pcp_update(zone);
        }
@@ -847,7 +1046,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
 {
        int ret;
        long offlined = *(long *)data;
-       ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
+       ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
        offlined = nr_pages;
        if (!ret)
                *(long *)data += offlined;
@@ -867,6 +1066,91 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
        return offlined;
 }
 
+/* ensure the node has NORMAL memory if it is still online */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+       unsigned long present_pages = 0;
+       enum zone_type zt;
+
+       for (zt = 0; zt <= ZONE_NORMAL; zt++)
+               present_pages += pgdat->node_zones[zt].present_pages;
+
+       if (present_pages > nr_pages)
+               return true;
+
+       present_pages = 0;
+       for (; zt <= ZONE_MOVABLE; zt++)
+               present_pages += pgdat->node_zones[zt].present_pages;
+
+       /*
+        * we can't offline the last normal memory until all
+        * higher memory is offlined.
+        */
+       return present_pages == 0;
+}
+
+/* check which state of node_states will be changed when offline memory */
+static void node_states_check_changes_offline(unsigned long nr_pages,
+               struct zone *zone, struct memory_notify *arg)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+       unsigned long present_pages = 0;
+       enum zone_type zt, zone_last = ZONE_NORMAL;
+
+       /*
+        * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
+        * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
+        *
+        * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
+        * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+        */
+       if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
+               zone_last = ZONE_MOVABLE;
+
+       /*
+        * check whether node_states[N_NORMAL_MEMORY] will be changed.
+        * If the memory to be offline is in a zone of 0...zone_last,
+        * and it is the last present memory, 0...zone_last will
+        * become empty after offline, thus we can determine we will
+        * need to clear the node from node_states[N_NORMAL_MEMORY].
+        */
+       for (zt = 0; zt <= zone_last; zt++)
+               present_pages += pgdat->node_zones[zt].present_pages;
+       if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+               arg->status_change_nid_normal = zone_to_nid(zone);
+       else
+               arg->status_change_nid_normal = -1;
+
+       /*
+        * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
+        */
+       zone_last = ZONE_MOVABLE;
+
+       /*
+        * check whether node_states[N_HIGH_MEMORY] will be changed
+        * If we try to offline the last present @nr_pages from the node,
+        * we can determine we will need to clear the node from
+        * node_states[N_HIGH_MEMORY].
+        */
+       for (; zt <= zone_last; zt++)
+               present_pages += pgdat->node_zones[zt].present_pages;
+       if (nr_pages >= present_pages)
+               arg->status_change_nid = zone_to_nid(zone);
+       else
+               arg->status_change_nid = -1;
+}
+
+static void node_states_clear_node(int node, struct memory_notify *arg)
+{
+       if (arg->status_change_nid_normal >= 0)
+               node_clear_state(node, N_NORMAL_MEMORY);
+
+       if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) &&
+           (arg->status_change_nid >= 0))
+               node_clear_state(node, N_HIGH_MEMORY);
+}
+
 static int __ref __offline_pages(unsigned long start_pfn,
                  unsigned long end_pfn, unsigned long timeout)
 {
@@ -893,16 +1177,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
        node = zone_to_nid(zone);
        nr_pages = end_pfn - start_pfn;
 
+       ret = -EINVAL;
+       if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
+               goto out;
+
        /* set above range as isolated */
-       ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+       ret = start_isolate_page_range(start_pfn, end_pfn,
+                                      MIGRATE_MOVABLE, true);
        if (ret)
                goto out;
 
        arg.start_pfn = start_pfn;
        arg.nr_pages = nr_pages;
-       arg.status_change_nid = -1;
-       if (nr_pages >= node_present_pages(node))
-               arg.status_change_nid = node;
+       node_states_check_changes_offline(nr_pages, zone, &arg);
 
        ret = memory_notify(MEM_GOING_OFFLINE, &arg);
        ret = notifier_to_errno(ret);
@@ -975,10 +1262,9 @@ repeat:
        } else
                zone_pcp_update(zone);
 
-       if (!node_present_pages(node)) {
-               node_clear_state(node, N_HIGH_MEMORY);
+       node_states_clear_node(node, &arg);
+       if (arg.status_change_nid >= 0)
                kswapd_stop(node);
-       }
 
        vm_total_pages = nr_free_pagecache_pages();
        writeback_set_ratelimit();
index 4ea600d..05b2836 100644 (file)
@@ -1907,7 +1907,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr, int node)
 {
        struct mempolicy *pol;
-       struct zonelist *zl;
        struct page *page;
        unsigned int cpuset_mems_cookie;
 
@@ -1926,23 +1925,11 @@ retry_cpuset:
 
                return page;
        }
-       zl = policy_zonelist(gfp, pol, node);
-       if (unlikely(mpol_needs_cond_ref(pol))) {
-               /*
-                * slow path: ref counted shared policy
-                */
-               struct page *page =  __alloc_pages_nodemask(gfp, order,
-                                               zl, policy_nodemask(gfp, pol));
-               __mpol_put(pol);
-               if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
-                       goto retry_cpuset;
-               return page;
-       }
-       /*
-        * fast path:  default or task policy
-        */
-       page = __alloc_pages_nodemask(gfp, order, zl,
+       page = __alloc_pages_nodemask(gfp, order,
+                                     policy_zonelist(gfp, pol, node),
                                      policy_nodemask(gfp, pol));
+       if (unlikely(mpol_needs_cond_ref(pol)))
+               __mpol_put(pol);
        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
                goto retry_cpuset;
        return page;
index 77ed2d7..3f675ca 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
+#include <linux/balloon_compaction.h>
 
 #include <asm/tlbflush.h>
 
@@ -79,7 +80,30 @@ void putback_lru_pages(struct list_head *l)
                list_del(&page->lru);
                dec_zone_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
-               putback_lru_page(page);
+                       putback_lru_page(page);
+       }
+}
+
+/*
+ * Put previously isolated pages back onto the appropriate lists
+ * from where they were once taken off for compaction/migration.
+ *
+ * This function shall be used instead of putback_lru_pages(),
+ * whenever the isolated pageset has been built by isolate_migratepages_range()
+ */
+void putback_movable_pages(struct list_head *l)
+{
+       struct page *page;
+       struct page *page2;
+
+       list_for_each_entry_safe(page, page2, l, lru) {
+               list_del(&page->lru);
+               dec_zone_page_state(page, NR_ISOLATED_ANON +
+                               page_is_file_cache(page));
+               if (unlikely(balloon_page_movable(page)))
+                       balloon_page_putback(page);
+               else
+                       putback_lru_page(page);
        }
 }
 
@@ -91,8 +115,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        swp_entry_t entry;
-       pgd_t *pgd;
-       pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;
        spinlock_t *ptl;
@@ -103,19 +125,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                        goto out;
                ptl = &mm->page_table_lock;
        } else {
-               pgd = pgd_offset(mm, addr);
-               if (!pgd_present(*pgd))
-                       goto out;
-
-               pud = pud_offset(pgd, addr);
-               if (!pud_present(*pud))
+               pmd = mm_find_pmd(mm, addr);
+               if (!pmd)
                        goto out;
-
-               pmd = pmd_offset(pud, addr);
                if (pmd_trans_huge(*pmd))
                        goto out;
-               if (!pmd_present(*pmd))
-                       goto out;
 
                ptep = pte_offset_map(pmd, addr);
 
@@ -286,7 +300,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
                /* Anonymous page without mapping */
                if (page_count(page) != 1)
                        return -EAGAIN;
-               return 0;
+               return MIGRATEPAGE_SUCCESS;
        }
 
        spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +370,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
        }
        spin_unlock_irq(&mapping->tree_lock);
 
-       return 0;
+       return MIGRATEPAGE_SUCCESS;
 }
 
 /*
@@ -372,7 +386,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
        if (!mapping) {
                if (page_count(page) != 1)
                        return -EAGAIN;
-               return 0;
+               return MIGRATEPAGE_SUCCESS;
        }
 
        spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +413,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
        page_unfreeze_refs(page, expected_count - 1);
 
        spin_unlock_irq(&mapping->tree_lock);
-       return 0;
+       return MIGRATEPAGE_SUCCESS;
 }
 
 /*
@@ -486,11 +500,11 @@ int migrate_page(struct address_space *mapping,
 
        rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
 
-       if (rc)
+       if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
 
        migrate_page_copy(newpage, page);
-       return 0;
+       return MIGRATEPAGE_SUCCESS;
 }
 EXPORT_SYMBOL(migrate_page);
 
@@ -513,7 +527,7 @@ int buffer_migrate_page(struct address_space *mapping,
 
        rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
 
-       if (rc)
+       if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
 
        /*
@@ -549,7 +563,7 @@ int buffer_migrate_page(struct address_space *mapping,
 
        } while (bh != head);
 
-       return 0;
+       return MIGRATEPAGE_SUCCESS;
 }
 EXPORT_SYMBOL(buffer_migrate_page);
 #endif
@@ -628,7 +642,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  *
  * Return value:
  *   < 0 - error code
- *  == 0 - success
+ *  MIGRATEPAGE_SUCCESS - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
                                int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +679,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
        else
                rc = fallback_migrate_page(mapping, newpage, page, mode);
 
-       if (rc) {
+       if (rc != MIGRATEPAGE_SUCCESS) {
                newpage->mapping = NULL;
        } else {
                if (remap_swapcache)
@@ -778,6 +792,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                }
        }
 
+       if (unlikely(balloon_page_movable(page))) {
+               /*
+                * A ballooned page does not need any special attention from
+                * physical to virtual reverse mapping procedures.
+                * Skip any attempt to unmap PTEs or to remap swap cache,
+                * in order to avoid burning cycles at rmap level, and perform
+                * the page migration right away (protected by page lock).
+                */
+               rc = balloon_page_migrate(newpage, page, mode);
+               goto uncharge;
+       }
+
        /*
         * Corner case handling:
         * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +840,9 @@ skip_unmap:
                put_anon_vma(anon_vma);
 
 uncharge:
-       mem_cgroup_end_migration(mem, page, newpage, rc == 0);
+       mem_cgroup_end_migration(mem, page, newpage,
+                                (rc == MIGRATEPAGE_SUCCESS ||
+                                 rc == MIGRATEPAGE_BALLOON_SUCCESS));
 unlock:
        unlock_page(page);
 out:
@@ -846,6 +874,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                        goto out;
 
        rc = __unmap_and_move(page, newpage, force, offlining, mode);
+
+       if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
+               /*
+                * A ballooned page has been migrated already.
+                * Now, it's the time to wrap-up counters,
+                * hand the page back to Buddy and return.
+                */
+               dec_zone_page_state(page, NR_ISOLATED_ANON +
+                                   page_is_file_cache(page));
+               balloon_page_free(page);
+               return MIGRATEPAGE_SUCCESS;
+       }
 out:
        if (rc != -EAGAIN) {
                /*
@@ -987,7 +1027,7 @@ int migrate_pages(struct list_head *from,
                        case -EAGAIN:
                                retry++;
                                break;
-                       case 0:
+                       case MIGRATEPAGE_SUCCESS:
                                break;
                        default:
                                /* Permanent failure */
@@ -996,15 +1036,12 @@ int migrate_pages(struct list_head *from,
                        }
                }
        }
-       rc = 0;
+       rc = nr_failed + retry;
 out:
        if (!swapwrite)
                current->flags &= ~PF_SWAPWRITE;
 
-       if (rc)
-               return rc;
-
-       return nr_failed + retry;
+       return rc;
 }
 
 int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1061,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
                        /* try again */
                        cond_resched();
                        break;
-               case 0:
+               case MIGRATEPAGE_SUCCESS:
                        goto out;
                default:
                        rc = -EIO;
index 7d41605..f940062 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
 #include <linux/audit.h>
 #include <linux/khugepaged.h>
 #include <linux/uprobes.h>
+#include <linux/rbtree_augmented.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -311,40 +312,88 @@ out:
        return retval;
 }
 
+static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+       unsigned long max, subtree_gap;
+       max = vma->vm_start;    /* own gap: vm_start minus prev's vm_end, if any */
+       if (vma->vm_prev)
+               max -= vma->vm_prev->vm_end;
+       if (vma->vm_rb.rb_left) {       /* fold in left child's rb_subtree_gap */
+               subtree_gap = rb_entry(vma->vm_rb.rb_left,
+                               struct vm_area_struct, vm_rb)->rb_subtree_gap;
+               if (subtree_gap > max)
+                       max = subtree_gap;
+       }
+       if (vma->vm_rb.rb_right) {      /* fold in right child's rb_subtree_gap */
+               subtree_gap = rb_entry(vma->vm_rb.rb_right,
+                               struct vm_area_struct, vm_rb)->rb_subtree_gap;
+               if (subtree_gap > max)
+                       max = subtree_gap;
+       }
+       return max;     /* unsigned long, same type as the rb_subtree_gap field */
+}
+
 #ifdef CONFIG_DEBUG_VM_RB
 static int browse_rb(struct rb_root *root)
 {
-       int i = 0, j;
+       int i = 0, j, bug = 0;
        struct rb_node *nd, *pn = NULL;
        unsigned long prev = 0, pend = 0;
 
        for (nd = rb_first(root); nd; nd = rb_next(nd)) {
                struct vm_area_struct *vma;
                vma = rb_entry(nd, struct vm_area_struct, vm_rb);
-               if (vma->vm_start < prev)
-                       printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
-               if (vma->vm_start < pend)
+               if (vma->vm_start < prev) {
+                       printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
+                       bug = 1;
+               }
+               if (vma->vm_start < pend) {
                        printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
-               if (vma->vm_start > vma->vm_end)
-                       printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
+                       bug = 1;
+               }
+               if (vma->vm_start > vma->vm_end) {
+                       printk("vm_end %lx < vm_start %lx\n",
+                               vma->vm_end, vma->vm_start);
+                       bug = 1;
+               }
+               if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
+                       printk("free gap %lx, correct %lx\n",
+                              vma->rb_subtree_gap,
+                              vma_compute_subtree_gap(vma));
+                       bug = 1;
+               }
                i++;
                pn = nd;
                prev = vma->vm_start;
                pend = vma->vm_end;
        }
        j = 0;
-       for (nd = pn; nd; nd = rb_prev(nd)) {
+       for (nd = pn; nd; nd = rb_prev(nd))
                j++;
+       if (i != j) {
+               printk("backwards %d, forwards %d\n", j, i);
+               bug = 1;
+       }
+       return bug ? -1 : i;
+}
+
+static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
+{
+       struct rb_node *nd;
+
+       for (nd = rb_first(root); nd; nd = rb_next(nd)) {
+               struct vm_area_struct *vma;
+               vma = rb_entry(nd, struct vm_area_struct, vm_rb);
+               BUG_ON(vma != ignore &&
+                      vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
        }
-       if (i != j)
-               printk("backwards %d, forwards %d\n", j, i), i = 0;
-       return i;
 }
 
 void validate_mm(struct mm_struct *mm)
 {
        int bug = 0;
        int i = 0;
+       unsigned long highest_address = 0;
        struct vm_area_struct *vma = mm->mmap;
        while (vma) {
                struct anon_vma_chain *avc;
@@ -352,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
                list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                        anon_vma_interval_tree_verify(avc);
                vma_unlock_anon_vma(vma);
+               highest_address = vma->vm_end;
                vma = vma->vm_next;
                i++;
        }
-       if (i != mm->map_count)
-               printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
+       if (i != mm->map_count) {
+               printk("map_count %d vm_next %d\n", mm->map_count, i);
+               bug = 1;
+       }
+       if (highest_address != mm->highest_vm_end) {
+               printk("mm->highest_vm_end %lx, found %lx\n",
+                      mm->highest_vm_end, highest_address);
+               bug = 1;
+       }
        i = browse_rb(&mm->mm_rb);
-       if (i != mm->map_count)
-               printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
+       if (i != mm->map_count) {
+               printk("map_count %d rb %d\n", mm->map_count, i);
+               bug = 1;
+       }
        BUG_ON(bug);
 }
 #else
+#define validate_mm_rb(root, ignore) do { } while (0)
 #define validate_mm(mm) do { } while (0)
 #endif
 
+RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
+                    unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+
+/*
+ * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
+ * vma->vm_prev->vm_end values changed, without modifying the vma's position
+ * in the rbtree.
+ */
+static void vma_gap_update(struct vm_area_struct *vma)
+{
+       /*
+        * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
+        * function that does exactly what we want.
+        */
+       vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
+}
+
+static inline void vma_rb_insert(struct vm_area_struct *vma,
+                                struct rb_root *root)
+{
+       /* All rb_subtree_gap values must be consistent prior to insertion */
+       validate_mm_rb(root, NULL);
+
+       rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+}
+
+static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+{
+       /*
+        * All rb_subtree_gap values must be consistent prior to erase,
+        * with the possible exception of the vma being erased.
+        */
+       validate_mm_rb(root, vma);
+
+       /*
+        * Note rb_erase_augmented is a fairly large inline function,
+        * so make sure we instantiate it only once with our desired
+        * augmented rbtree callbacks.
+        */
+       rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+}
+
 /*
  * vma has some anon_vma assigned, and is already inserted on that
  * anon_vma's interval trees.
@@ -435,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
                struct rb_node **rb_link, struct rb_node *rb_parent)
 {
+       /* Update tracking information for the gap following the new vma. */
+       if (vma->vm_next)
+               vma_gap_update(vma->vm_next);
+       else
+               mm->highest_vm_end = vma->vm_end;
+
+       /*
+        * vma->vm_prev wasn't known when we followed the rbtree to find the
+        * correct insertion point for that vma. As a result, we could not
+        * update the vma vm_rb parents rb_subtree_gap values on the way down.
+        * So, we first insert the vma with a zero rb_subtree_gap value
+        * (to be consistent with what we did on the way down), and then
+        * immediately update the gap to the correct value. Finally we
+        * rebalance the rbtree after all augmented values have been set.
+        */
        rb_link_node(&vma->vm_rb, rb_parent, rb_link);
-       rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+       vma->rb_subtree_gap = 0;
+       vma_gap_update(vma);
+       vma_rb_insert(vma, &mm->mm_rb);
 }
 
 static void __vma_link_file(struct vm_area_struct *vma)
@@ -512,12 +631,12 @@ static inline void
 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev)
 {
-       struct vm_area_struct *next = vma->vm_next;
+       struct vm_area_struct *next;
 
-       prev->vm_next = next;
+       vma_rb_erase(vma, &mm->mm_rb);
+       prev->vm_next = next = vma->vm_next;
        if (next)
                next->vm_prev = prev;
-       rb_erase(&vma->vm_rb, &mm->mm_rb);
        if (mm->mmap_cache == vma)
                mm->mmap_cache = prev;
 }
@@ -539,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
        struct rb_root *root = NULL;
        struct anon_vma *anon_vma = NULL;
        struct file *file = vma->vm_file;
+       bool start_changed = false, end_changed = false;
        long adjust_next = 0;
        int remove_next = 0;
 
@@ -629,8 +749,14 @@ again:                     remove_next = 1 + (end > next->vm_end);
                        vma_interval_tree_remove(next, root);
        }
 
-       vma->vm_start = start;
-       vma->vm_end = end;
+       if (start != vma->vm_start) {
+               vma->vm_start = start;
+               start_changed = true;
+       }
+       if (end != vma->vm_end) {
+               vma->vm_end = end;
+               end_changed = true;
+       }
        vma->vm_pgoff = pgoff;
        if (adjust_next) {
                next->vm_start += adjust_next << PAGE_SHIFT;
@@ -659,6 +785,15 @@ again:                     remove_next = 1 + (end > next->vm_end);
                 * (it may either follow vma or precede it).
                 */
                __insert_vm_struct(mm, insert);
+       } else {
+               if (start_changed)
+                       vma_gap_update(vma);
+               if (end_changed) {
+                       if (!next)
+                               mm->highest_vm_end = end;
+                       else if (!adjust_next)
+                               vma_gap_update(next);
+               }
        }
 
        if (anon_vma) {
@@ -692,10 +827,13 @@ again:                    remove_next = 1 + (end > next->vm_end);
                 * we must remove another next too. It would clutter
                 * up the code too much to do both in one go.
                 */
-               if (remove_next == 2) {
-                       next = vma->vm_next;
+               next = vma->vm_next;
+               if (remove_next == 2)
                        goto again;
-               }
+               else if (next)
+                       vma_gap_update(next);
+               else
+                       mm->highest_vm_end = end;
        }
        if (insert && file)
                uprobe_mmap(insert);
@@ -1167,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                 * memory so no accounting is necessary
                 */
                file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
-                                               VM_NORESERVE, &user,
-                                               HUGETLB_ANONHUGE_INODE);
+                               VM_NORESERVE,
+                               &user, HUGETLB_ANONHUGE_INODE,
+                               (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                if (IS_ERR(file))
                        return PTR_ERR(file);
        }
@@ -1414,6 +1553,206 @@ unacct_error:
        return error;
 }
 
+unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+{
+       /*
+        * We implement the search by looking for an rbtree node that
+        * immediately follows a suitable gap. That is,
+        * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
+        * - gap_end   = vma->vm_start        >= info->low_limit  + length;
+        * - gap_end - gap_start >= length
+        */
+
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+       /* Adjust search length to account for worst case alignment overhead */
+       length = info->length + info->align_mask;
+       if (length < info->length)
+               return -ENOMEM;
+
+       /* Adjust search limits by the desired length */
+       if (info->high_limit < length)
+               return -ENOMEM;
+       high_limit = info->high_limit - length;
+
+       if (info->low_limit > high_limit)
+               return -ENOMEM;
+       low_limit = info->low_limit + length;
+
+       /* Check if rbtree root looks promising */
+       if (RB_EMPTY_ROOT(&mm->mm_rb))
+               goto check_highest;
+       vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+       if (vma->rb_subtree_gap < length)
+               goto check_highest;
+
+       while (true) {
+               /* Visit left subtree if it looks promising */
+               gap_end = vma->vm_start;
+               if (gap_end >= low_limit && vma->vm_rb.rb_left) {
+                       struct vm_area_struct *left =
+                               rb_entry(vma->vm_rb.rb_left,
+                                        struct vm_area_struct, vm_rb);
+                       if (left->rb_subtree_gap >= length) {
+                               vma = left;
+                               continue;
+                       }
+               }
+
+               gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+check_current:
+               /* Check if current node has a suitable gap */
+               if (gap_start > high_limit)
+                       return -ENOMEM;
+               if (gap_end >= low_limit && gap_end - gap_start >= length)
+                       goto found;
+
+               /* Visit right subtree if it looks promising */
+               if (vma->vm_rb.rb_right) {
+                       struct vm_area_struct *right =
+                               rb_entry(vma->vm_rb.rb_right,
+                                        struct vm_area_struct, vm_rb);
+                       if (right->rb_subtree_gap >= length) {
+                               vma = right;
+                               continue;
+                       }
+               }
+
+               /* Go back up the rbtree to find next candidate node */
+               while (true) {
+                       struct rb_node *prev = &vma->vm_rb;
+                       if (!rb_parent(prev))
+                               goto check_highest;
+                       vma = rb_entry(rb_parent(prev),
+                                      struct vm_area_struct, vm_rb);
+                       if (prev == vma->vm_rb.rb_left) {
+                               gap_start = vma->vm_prev->vm_end;
+                               gap_end = vma->vm_start;
+                               goto check_current;
+                       }
+               }
+       }
+
+check_highest:
+       /* Check highest gap, which does not precede any rbtree node */
+       gap_start = mm->highest_vm_end;
+       gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */
+       if (gap_start > high_limit)
+               return -ENOMEM;
+
+found:
+       /* We found a suitable gap. Clip it with the original low_limit. */
+       if (gap_start < info->low_limit)
+               gap_start = info->low_limit;
+
+       /* Adjust gap address to the desired alignment */
+       gap_start += (info->align_offset - gap_start) & info->align_mask;
+
+       VM_BUG_ON(gap_start + info->length > info->high_limit);
+       VM_BUG_ON(gap_start + info->length > gap_end);
+       return gap_start;
+}
+
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+       /* Adjust search length to account for worst case alignment overhead */
+       length = info->length + info->align_mask;
+       if (length < info->length)
+               return -ENOMEM;
+
+       /*
+        * Adjust search limits by the desired length.
+        * See implementation comment at top of unmapped_area().
+        */
+       gap_end = info->high_limit;
+       if (gap_end < length)
+               return -ENOMEM;
+       high_limit = gap_end - length;
+
+       if (info->low_limit > high_limit)
+               return -ENOMEM;
+       low_limit = info->low_limit + length;
+
+       /* Check highest gap, which does not precede any rbtree node */
+       gap_start = mm->highest_vm_end;
+       if (gap_start <= high_limit)
+               goto found_highest;
+
+       /* Check if rbtree root looks promising */
+       if (RB_EMPTY_ROOT(&mm->mm_rb))
+               return -ENOMEM;
+       vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+       if (vma->rb_subtree_gap < length)
+               return -ENOMEM;
+
+       while (true) {
+               /* Visit right subtree if it looks promising */
+               gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+               if (gap_start <= high_limit && vma->vm_rb.rb_right) {
+                       struct vm_area_struct *right =
+                               rb_entry(vma->vm_rb.rb_right,
+                                        struct vm_area_struct, vm_rb);
+                       if (right->rb_subtree_gap >= length) {
+                               vma = right;
+                               continue;
+                       }
+               }
+
+check_current:
+               /* Check if current node has a suitable gap */
+               gap_end = vma->vm_start;
+               if (gap_end < low_limit)
+                       return -ENOMEM;
+               if (gap_start <= high_limit && gap_end - gap_start >= length)
+                       goto found;
+
+               /* Visit left subtree if it looks promising */
+               if (vma->vm_rb.rb_left) {
+                       struct vm_area_struct *left =
+                               rb_entry(vma->vm_rb.rb_left,
+                                        struct vm_area_struct, vm_rb);
+                       if (left->rb_subtree_gap >= length) {
+                               vma = left;
+                               continue;
+                       }
+               }
+
+               /* Go back up the rbtree to find next candidate node */
+               while (true) {
+                       struct rb_node *prev = &vma->vm_rb;
+                       if (!rb_parent(prev))
+                               return -ENOMEM;
+                       vma = rb_entry(rb_parent(prev),
+                                      struct vm_area_struct, vm_rb);
+                       if (prev == vma->vm_rb.rb_right) {
+                               gap_start = vma->vm_prev ?
+                                       vma->vm_prev->vm_end : 0;
+                               goto check_current;
+                       }
+               }
+       }
+
+found:
+       /* We found a suitable gap. Clip it with the original high_limit. */
+       if (gap_end > info->high_limit)
+               gap_end = info->high_limit;
+
+found_highest:
+       /* Compute highest gap address at the desired alignment */
+       gap_end -= info->length;
+       gap_end -= (gap_end - info->align_offset) & info->align_mask;
+
+       VM_BUG_ON(gap_end < info->low_limit);
+       VM_BUG_ON(gap_end < gap_start);
+       return gap_end;
+}
+
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
@@ -1432,7 +1771,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
-       unsigned long start_addr;
+       struct vm_unmapped_area_info info;
 
        if (len > TASK_SIZE)
                return -ENOMEM;
@@ -1447,40 +1786,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
                    (!vma || addr + len <= vma->vm_start))
                        return addr;
        }
-       if (len > mm->cached_hole_size) {
-               start_addr = addr = mm->free_area_cache;
-       } else {
-               start_addr = addr = TASK_UNMAPPED_BASE;
-               mm->cached_hole_size = 0;
-       }
 
-full_search:
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (TASK_SIZE - len < addr) {
-                       /*
-                        * Start a new search - just in case we missed
-                        * some holes.
-                        */
-                       if (start_addr != TASK_UNMAPPED_BASE) {
-                               addr = TASK_UNMAPPED_BASE;
-                               start_addr = addr;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (!vma || addr + len <= vma->vm_start) {
-                       /*
-                        * Remember the place where we stopped the search:
-                        */
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-               addr = vma->vm_end;
-       }
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = TASK_UNMAPPED_BASE;
+       info.high_limit = TASK_SIZE;
+       info.align_mask = 0;
+       return vm_unmapped_area(&info);
 }
 #endif 
 
@@ -1505,7 +1817,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 {
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
-       unsigned long addr = addr0, start_addr;
+       unsigned long addr = addr0;
+       struct vm_unmapped_area_info info;
 
        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
@@ -1523,53 +1836,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                        return addr;
        }
 
-       /* check if free_area_cache is useful for us */
-       if (len <= mm->cached_hole_size) {
-               mm->cached_hole_size = 0;
-               mm->free_area_cache = mm->mmap_base;
-       }
-
-try_again:
-       /* either no address requested or can't fit in requested address hole */
-       start_addr = addr = mm->free_area_cache;
-
-       if (addr < len)
-               goto fail;
-
-       addr -= len;
-       do {
-               /*
-                * Lookup failure means no vma is above this address,
-                * else if new region fits below vma->vm_start,
-                * return with success:
-                */
-               vma = find_vma(mm, addr);
-               if (!vma || addr+len <= vma->vm_start)
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr);
-
-               /* remember the largest hole we saw so far */
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               /* try just below the current vma->vm_start */
-               addr = vma->vm_start-len;
-       } while (len < vma->vm_start);
-
-fail:
-       /*
-        * if hint left us with no space for the requested
-        * mapping then try again:
-        *
-        * Note: this is different with the case of bottomup
-        * which does the fully line-search, but we use find_vma
-        * here that causes some holes skipped.
-        */
-       if (start_addr != mm->mmap_base) {
-               mm->free_area_cache = mm->mmap_base;
-               mm->cached_hole_size = 0;
-               goto try_again;
-       }
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = PAGE_SIZE;
+       info.high_limit = mm->mmap_base;
+       info.align_mask = 0;
+       addr = vm_unmapped_area(&info);
 
        /*
         * A failed mmap() very likely causes application failure,
@@ -1577,14 +1849,13 @@ fail:
         * can happen with large stack limits and large mmap()
         * allocations.
         */
-       mm->cached_hole_size = ~0UL;
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = mm->mmap_base;
-       mm->cached_hole_size = ~0UL;
+       if (addr & ~PAGE_MASK) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.flags = 0;
+               info.low_limit = TASK_UNMAPPED_BASE;
+               info.high_limit = TASK_SIZE;
+               addr = vm_unmapped_area(&info);
+       }
 
        return addr;
 }
@@ -1797,6 +2068,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
                                anon_vma_interval_tree_post_update_vma(vma);
+                               if (vma->vm_next)
+                                       vma_gap_update(vma->vm_next);
+                               else
+                                       vma->vm_mm->highest_vm_end = address;
                                perf_event_mmap(vma);
                        }
                }
@@ -1851,6 +2126,7 @@ int expand_downwards(struct vm_area_struct *vma,
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
                                anon_vma_interval_tree_post_update_vma(vma);
+                               vma_gap_update(vma);
                                perf_event_mmap(vma);
                        }
                }
@@ -1973,14 +2249,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
        insertion_point = (prev ? &prev->vm_next : &mm->mmap);
        vma->vm_prev = NULL;
        do {
-               rb_erase(&vma->vm_rb, &mm->mm_rb);
+               vma_rb_erase(vma, &mm->mm_rb);
                mm->map_count--;
                tail_vma = vma;
                vma = vma->vm_next;
        } while (vma && vma->vm_start < end);
        *insertion_point = vma;
-       if (vma)
+       if (vma) {
                vma->vm_prev = prev;
+               vma_gap_update(vma);
+       } else
+               mm->highest_vm_end = prev ? prev->vm_end : 0;
        tail_vma->vm_next = NULL;
        if (mm->unmap_area == arch_unmap_area)
                addr = prev ? prev->vm_end : mm->mmap_base;
index 79e0f3e..18f1ae2 100644 (file)
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 static DEFINE_SPINLOCK(zone_scan_lock);
 
-/*
- * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
- * @old_val: old oom_score_adj for compare
- * @new_val: new oom_score_adj for swap
- *
- * Sets the oom_score_adj value for current to @new_val iff its present value is
- * @old_val.  Usually used to reinstate a previous value to prevent racing with
- * userspace tuning the value in the interim.
- */
-void compare_swap_oom_score_adj(int old_val, int new_val)
-{
-       struct sighand_struct *sighand = current->sighand;
-
-       spin_lock_irq(&sighand->siglock);
-       if (current->signal->oom_score_adj == old_val)
-               current->signal->oom_score_adj = new_val;
-       trace_oom_score_adj_update(current);
-       spin_unlock_irq(&sighand->siglock);
-}
-
-/**
- * test_set_oom_score_adj() - set current's oom_score_adj and return old value
- * @new_val: new oom_score_adj value
- *
- * Sets the oom_score_adj value for current to @new_val with proper
- * synchronization and returns the old value.  Usually used to temporarily
- * set a value, save the old value in the caller, and then reinstate it later.
- */
-int test_set_oom_score_adj(int new_val)
-{
-       struct sighand_struct *sighand = current->sighand;
-       int old_val;
-
-       spin_lock_irq(&sighand->siglock);
-       old_val = current->signal->oom_score_adj;
-       current->signal->oom_score_adj = new_val;
-       trace_oom_score_adj_update(current);
-       spin_unlock_irq(&sighand->siglock);
-
-       return old_val;
-}
-
 #ifdef CONFIG_NUMA
 /**
  * has_intersects_mems_allowed() - check task eligibility for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
        if (!p)
                return 0;
 
-       adj = p->signal->oom_score_adj;
+       adj = (long)p->signal->oom_score_adj;
        if (adj == OOM_SCORE_ADJ_MIN) {
                task_unlock(p);
                return 0;
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
        if (!task->mm)
                return OOM_SCAN_CONTINUE;
 
-       if (task->flags & PF_EXITING) {
+       /*
+        * If task is allocating a lot of memory and has been marked to be
+        * killed first if it triggers an oom, then select it.
+        */
+       if (oom_task_origin(task))
+               return OOM_SCAN_SELECT;
+
+       if (task->flags & PF_EXITING && !force_kill) {
                /*
-                * If task is current and is in the process of releasing memory,
-                * allow the "kill" to set TIF_MEMDIE, which will allow it to
-                * access memory reserves.  Otherwise, it may stall forever.
-                *
-                * The iteration isn't broken here, however, in case other
-                * threads are found to have already been oom killed.
+                * If this task is not being ptraced on exit, then wait for it
+                * to finish before killing some other task unnecessarily.
                 */
-               if (task == current)
-                       return OOM_SCAN_SELECT;
-               else if (!force_kill) {
-                       /*
-                        * If this task is not being ptraced on exit, then wait
-                        * for it to finish before killing some other task
-                        * unnecessarily.
-                        */
-                       if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
-                               return OOM_SCAN_ABORT;
-               }
+               if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
+                       return OOM_SCAN_ABORT;
        }
        return OOM_SCAN_OK;
 }
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
                        continue;
                }
 
-               pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu         %5d %s\n",
+               pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu         %5hd %s\n",
                        task->pid, from_kuid(&init_user_ns, task_uid(task)),
                        task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
                        task->mm->nr_ptes,
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 {
        task_lock(current);
        pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-               "oom_score_adj=%d\n",
+               "oom_score_adj=%hd\n",
                current->comm, gfp_mask, order,
                current->signal->oom_score_adj);
        cpuset_print_task_mems_allowed(current);
@@ -706,11 +658,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                return;
 
        /*
-        * If current has a pending SIGKILL, then automatically select it.  The
-        * goal is to allow it to allocate so that it may quickly exit and free
-        * its memory.
+        * If current has a pending SIGKILL or is exiting, then automatically
+        * select it.  The goal is to allow it to allocate so that it may
+        * quickly exit and free its memory.
         */
-       if (fatal_signal_pending(current)) {
+       if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
                set_thread_flag(TIF_MEMDIE);
                return;
        }
index 830893b..6f42712 100644 (file)
@@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
 }
 
 /*
- * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
  * will look to see if it needs to start dirty throttling.
  *
  * If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
 DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
 
 /**
- * balance_dirty_pages_ratelimited_nr - balance dirty memory state
+ * balance_dirty_pages_ratelimited - balance dirty memory state
  * @mapping: address_space which was dirtied
- * @nr_pages_dirtied: number of pages which the caller has just dirtied
  *
  * Processes which are dirtying memory should call in here once for each page
  * which was newly dirtied.  The function will periodically check the system's
@@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
  * limit we decrease the ratelimiting by a lot, to prevent individual processes
  * from overshooting the limit by (ratelimit_pages) each.
  */
-void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
-                                       unsigned long nr_pages_dirtied)
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
 {
        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int ratelimit;
@@ -1