mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91
  92 #include <asm/tlbflush.h>
  93 #include <asm/uaccess.h>
  94
  95 /* Internal flags */
  96 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  97 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  98 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  99
 100 static struct kmem_cache *policy_cache;
 101 static struct kmem_cache *sn_cache;
 102
 103 /* Highest zone. An specific allocation for a zone below that is not
 104    policied. */
 105 enum zone_type policy_zone = 0;
 106
 107 struct mempolicy default_policy = {
 108         .refcnt = ATOMIC_INIT(1), /* never free it */
 109         .policy = MPOL_DEFAULT,
 110 };
 111
 112 static const struct mempolicy_operations {
 113         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 114         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 115 } mpol_ops[MPOL_MAX];
 116
 117 /* Check that the nodemask contains at least one populated zone */
 118 static int is_valid_nodemask(const nodemask_t *nodemask)
 119 {
 120         int nd, k;
 121
 122         /* Check that there is something useful in this mask */
 123         k = policy_zone;
 124
 125         for_each_node_mask(nd, *nodemask) {
 126                 struct zone *z;
 127
 128                 for (k = 0; k <= policy_zone; k++) {
 129                         z = &NODE_DATA(nd)->node_zones[k];
 130                         if (z->present_pages > 0)
 131                                 return 1;
 132                 }
 133         }
 134
 135         return 0;
 136 }
 137
 138 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 139 {
 140         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 141 }
 142
 143 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 144                                    const nodemask_t *rel)
 145 {
 146         nodemask_t tmp;
 147         nodes_fold(tmp, *orig, nodes_weight(*rel));
 148         nodes_onto(*ret, tmp, *rel);
 149 }
 150
 151 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 152 {
 153         if (nodes_empty(*nodes))
 154                 return -EINVAL;
 155         pol->v.nodes = *nodes;
 156         return 0;
 157 }
 158
 159 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 160 {
 161         if (!nodes)
 162                 pol->v.preferred_node = -1;     /* local allocation */
 163         else if (nodes_empty(*nodes))
 164                 return -EINVAL;                 /*  no allowed nodes */
 165         else
 166                 pol->v.preferred_node = first_node(*nodes);
 167         return 0;
 168 }
 169
 170 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 171 {
 172         if (!is_valid_nodemask(nodes))
 173                 return -EINVAL;
 174         pol->v.nodes = *nodes;
 175         return 0;
 176 }
 177
 178 /* Create a new policy */
 179 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 180                                   nodemask_t *nodes)
 181 {
 182         struct mempolicy *policy;
 183         nodemask_t cpuset_context_nmask;
 184         int ret;
 185
 186         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 187                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 188
 189         if (mode == MPOL_DEFAULT) {
 190                 if (nodes && !nodes_empty(*nodes))
 191                         return ERR_PTR(-EINVAL);
 192                 return NULL;
 193         }
 194         VM_BUG_ON(!nodes);
 195
 196         /*
 197          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 198          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 199          * All other modes require a valid pointer to a non-empty nodemask.
 200          */
 201         if (mode == MPOL_PREFERRED) {
 202                 if (nodes_empty(*nodes)) {
 203                         if (((flags & MPOL_F_STATIC_NODES) ||
 204                              (flags & MPOL_F_RELATIVE_NODES)))
 205                                 return ERR_PTR(-EINVAL);
 206                         nodes = NULL;   /* flag local alloc */
 207                 }
 208         } else if (nodes_empty(*nodes))
 209                 return ERR_PTR(-EINVAL);
 210         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 211         if (!policy)
 212                 return ERR_PTR(-ENOMEM);
 213         atomic_set(&policy->refcnt, 1);
 214         policy->policy = mode;
 215         policy->flags = flags;
 216
 217         if (nodes) {
 218                 /*
 219                  * cpuset related setup doesn't apply to local allocation
 220                  */
 221                 cpuset_update_task_memory_state();
 222                 if (flags & MPOL_F_RELATIVE_NODES)
 223                         mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 224                                                &cpuset_current_mems_allowed);
 225                 else
 226                         nodes_and(cpuset_context_nmask, *nodes,
 227                                   cpuset_current_mems_allowed);
 228                 if (mpol_store_user_nodemask(policy))
 229                         policy->w.user_nodemask = *nodes;
 230                 else
 231                         policy->w.cpuset_mems_allowed =
 232                                                 cpuset_mems_allowed(current);
 233         }
 234
 235         ret = mpol_ops[mode].create(policy,
 236                                 nodes ? &cpuset_context_nmask : NULL);
 237         if (ret < 0) {
 238                 kmem_cache_free(policy_cache, policy);
 239                 return ERR_PTR(ret);
 240         }
 241         return policy;
 242 }
 243
 244 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 245 {
 246 }
 247
 248 static void mpol_rebind_nodemask(struct mempolicy *pol,
 249                                  const nodemask_t *nodes)
 250 {
 251         nodemask_t tmp;
 252
 253         if (pol->flags & MPOL_F_STATIC_NODES)
 254                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 255         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 256                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 257         else {
 258                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 259                             *nodes);
 260                 pol->w.cpuset_mems_allowed = *nodes;
 261         }
 262
 263         pol->v.nodes = tmp;
 264         if (!node_isset(current->il_next, tmp)) {
 265                 current->il_next = next_node(current->il_next, tmp);
 266                 if (current->il_next >= MAX_NUMNODES)
 267                         current->il_next = first_node(tmp);
 268                 if (current->il_next >= MAX_NUMNODES)
 269                         current->il_next = numa_node_id();
 270         }
 271 }
 272
 273 static void mpol_rebind_preferred(struct mempolicy *pol,
 274                                   const nodemask_t *nodes)
 275 {
 276         nodemask_t tmp;
 277
 278         if (pol->flags & MPOL_F_STATIC_NODES) {
 279                 int node = first_node(pol->w.user_nodemask);
 280
 281                 if (node_isset(node, *nodes))
 282                         pol->v.preferred_node = node;
 283                 else
 284                         pol->v.preferred_node = -1;
 285         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 286                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 287                 pol->v.preferred_node = first_node(tmp);
 288         } else if (pol->v.preferred_node != -1) {
 289                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 290                                                    pol->w.cpuset_mems_allowed,
 291                                                    *nodes);
 292                 pol->w.cpuset_mems_allowed = *nodes;
 293         }
 294 }
 295
 296 /* Migrate a policy to a different set of nodes */
 297 static void mpol_rebind_policy(struct mempolicy *pol,
 298                                const nodemask_t *newmask)
 299 {
 300         if (!pol)
 301                 return;
 302         if (!mpol_store_user_nodemask(pol) &&
 303             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 304                 return;
 305         mpol_ops[pol->policy].rebind(pol, newmask);
 306 }
 307
 308 /*
 309  * Wrapper for mpol_rebind_policy() that just requires task
 310  * pointer, and updates task mempolicy.
 311  */
 312
 313 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 314 {
 315         mpol_rebind_policy(tsk->mempolicy, new);
 316 }
 317
 318 /*
 319  * Rebind each vma in mm to new nodemask.
 320  *
 321  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 322  */
 323
 324 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 325 {
 326         struct vm_area_struct *vma;
 327
 328         down_write(&mm->mmap_sem);
 329         for (vma = mm->mmap; vma; vma = vma->vm_next)
 330                 mpol_rebind_policy(vma->vm_policy, new);
 331         up_write(&mm->mmap_sem);
 332 }
 333
 334 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 335         [MPOL_DEFAULT] = {
 336                 .rebind = mpol_rebind_default,
 337         },
 338         [MPOL_INTERLEAVE] = {
 339                 .create = mpol_new_interleave,
 340                 .rebind = mpol_rebind_nodemask,
 341         },
 342         [MPOL_PREFERRED] = {
 343                 .create = mpol_new_preferred,
 344                 .rebind = mpol_rebind_preferred,
 345         },
 346         [MPOL_BIND] = {
 347                 .create = mpol_new_bind,
 348                 .rebind = mpol_rebind_nodemask,
 349         },
 350 };
 351
 352 static void gather_stats(struct page *, void *, int pte_dirty);
 353 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 354                                 unsigned long flags);
 355
 356 /* Scan through pages checking if pages follow certain conditions. */
 357 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 358                 unsigned long addr, unsigned long end,
 359                 const nodemask_t *nodes, unsigned long flags,
 360                 void *private)
 361 {
 362         pte_t *orig_pte;
 363         pte_t *pte;
 364         spinlock_t *ptl;
 365
 366         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 367         do {
 368                 struct page *page;
 369                 int nid;
 370
 371                 if (!pte_present(*pte))
 372                         continue;
 373                 page = vm_normal_page(vma, addr, *pte);
 374                 if (!page)
 375                         continue;
 376                 /*
 377                  * The check for PageReserved here is important to avoid
 378                  * handling zero pages and other pages that may have been
 379                  * marked special by the system.
 380                  *
 381                  * If the PageReserved would not be checked here then f.e.
 382                  * the location of the zero page could have an influence
 383                  * on MPOL_MF_STRICT, zero pages would be counted for
 384                  * the per node stats, and there would be useless attempts
 385                  * to put zero pages on the migration list.
 386                  */
 387                 if (PageReserved(page))
 388                         continue;
 389                 nid = page_to_nid(page);
 390                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 391                         continue;
 392
 393                 if (flags & MPOL_MF_STATS)
 394                         gather_stats(page, private, pte_dirty(*pte));
 395                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 396                         migrate_page_add(page, private, flags);
 397                 else
 398                         break;
 399         } while (pte++, addr += PAGE_SIZE, addr != end);
 400         pte_unmap_unlock(orig_pte, ptl);
 401         return addr != end;
 402 }
 403
 404 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 405                 unsigned long addr, unsigned long end,
 406                 const nodemask_t *nodes, unsigned long flags,
 407                 void *private)
 408 {
 409         pmd_t *pmd;
 410         unsigned long next;
 411
 412         pmd = pmd_offset(pud, addr);
 413         do {
 414                 next = pmd_addr_end(addr, end);
 415                 if (pmd_none_or_clear_bad(pmd))
 416                         continue;
 417                 if (check_pte_range(vma, pmd, addr, next, nodes,
 418                                     flags, private))
 419                         return -EIO;
 420         } while (pmd++, addr = next, addr != end);
 421         return 0;
 422 }
 423
 424 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 425                 unsigned long addr, unsigned long end,
 426                 const nodemask_t *nodes, unsigned long flags,
 427                 void *private)
 428 {
 429         pud_t *pud;
 430         unsigned long next;
 431
 432         pud = pud_offset(pgd, addr);
 433         do {
 434                 next = pud_addr_end(addr, end);
 435                 if (pud_none_or_clear_bad(pud))
 436                         continue;
 437                 if (check_pmd_range(vma, pud, addr, next, nodes,
 438                                     flags, private))
 439                         return -EIO;
 440         } while (pud++, addr = next, addr != end);
 441         return 0;
 442 }
 443
 444 static inline int check_pgd_range(struct vm_area_struct *vma,
 445                 unsigned long addr, unsigned long end,
 446                 const nodemask_t *nodes, unsigned long flags,
 447                 void *private)
 448 {
 449         pgd_t *pgd;
 450         unsigned long next;
 451
 452         pgd = pgd_offset(vma->vm_mm, addr);
 453         do {
 454                 next = pgd_addr_end(addr, end);
 455                 if (pgd_none_or_clear_bad(pgd))
 456                         continue;
 457                 if (check_pud_range(vma, pgd, addr, next, nodes,
 458                                     flags, private))
 459                         return -EIO;
 460         } while (pgd++, addr = next, addr != end);
 461         return 0;
 462 }
 463
 464 /*
 465  * Check if all pages in a range are on a set of nodes.
 466  * If pagelist != NULL then isolate pages from the LRU and
 467  * put them on the pagelist.
 468  */
 469 static struct vm_area_struct *
 470 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 471                 const nodemask_t *nodes, unsigned long flags, void *private)
 472 {
 473         int err;
 474         struct vm_area_struct *first, *vma, *prev;
 475
 476         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 477
 478                 err = migrate_prep();
 479                 if (err)
 480                         return ERR_PTR(err);
 481         }
 482
 483         first = find_vma(mm, start);
 484         if (!first)
 485                 return ERR_PTR(-EFAULT);
 486         prev = NULL;
 487         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 488                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 489                         if (!vma->vm_next && vma->vm_end < end)
 490                                 return ERR_PTR(-EFAULT);
 491                         if (prev && prev->vm_end < vma->vm_start)
 492                                 return ERR_PTR(-EFAULT);
 493                 }
 494                 if (!is_vm_hugetlb_page(vma) &&
 495                     ((flags & MPOL_MF_STRICT) ||
 496                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 497                                 vma_migratable(vma)))) {
 498                         unsigned long endvma = vma->vm_end;
 499
 500                         if (endvma > end)
 501                                 endvma = end;
 502                         if (vma->vm_start > start)
 503                                 start = vma->vm_start;
 504                         err = check_pgd_range(vma, start, endvma, nodes,
 505                                                 flags, private);
 506                         if (err) {
 507                                 first = ERR_PTR(err);
 508                                 break;
 509                         }
 510                 }
 511                 prev = vma;
 512         }
 513         return first;
 514 }
 515
 516 /* Apply policy to a single VMA */
 517 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 518 {
 519         int err = 0;
 520         struct mempolicy *old = vma->vm_policy;
 521
 522         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 523                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 524                  vma->vm_ops, vma->vm_file,
 525                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 526
 527         if (vma->vm_ops && vma->vm_ops->set_policy)
 528                 err = vma->vm_ops->set_policy(vma, new);
 529         if (!err) {
 530                 mpol_get(new);
 531                 vma->vm_policy = new;
 532                 mpol_put(old);
 533         }
 534         return err;
 535 }
 536
 537 /* Step 2: apply policy to a range and do splits. */
 538 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 539                        unsigned long end, struct mempolicy *new)
 540 {
 541         struct vm_area_struct *next;
 542         int err;
 543
 544         err = 0;
 545         for (; vma && vma->vm_start < end; vma = next) {
 546                 next = vma->vm_next;
 547                 if (vma->vm_start < start)
 548                         err = split_vma(vma->vm_mm, vma, start, 1);
 549                 if (!err && vma->vm_end > end)
 550                         err = split_vma(vma->vm_mm, vma, end, 0);
 551                 if (!err)
 552                         err = policy_vma(vma, new);
 553                 if (err)
 554                         break;
 555         }
 556         return err;
 557 }
 558
 559 /*
 560  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 561  * mempolicy.  Allows more rapid checking of this (combined perhaps
 562  * with other PF_* flag bits) on memory allocation hot code paths.
 563  *
 564  * If called from outside this file, the task 'p' should -only- be
 565  * a newly forked child not yet visible on the task list, because
 566  * manipulating the task flags of a visible task is not safe.
 567  *
 568  * The above limitation is why this routine has the funny name
 569  * mpol_fix_fork_child_flag().
 570  *
 571  * It is also safe to call this with a task pointer of current,
 572  * which the static wrapper mpol_set_task_struct_flag() does,
 573  * for use within this file.
 574  */
 575
 576 void mpol_fix_fork_child_flag(struct task_struct *p)
 577 {
 578         if (p->mempolicy)
 579                 p->flags |= PF_MEMPOLICY;
 580         else
 581                 p->flags &= ~PF_MEMPOLICY;
 582 }
 583
 584 static void mpol_set_task_struct_flag(void)
 585 {
 586         mpol_fix_fork_child_flag(current);
 587 }
 588
 589 /* Set the process memory policy */
 590 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 591                              nodemask_t *nodes)
 592 {
 593         struct mempolicy *new;
 594         struct mm_struct *mm = current->mm;
 595
 596         new = mpol_new(mode, flags, nodes);
 597         if (IS_ERR(new))
 598                 return PTR_ERR(new);
 599
 600         /*
 601          * prevent changing our mempolicy while show_numa_maps()
 602          * is using it.
 603          * Note:  do_set_mempolicy() can be called at init time
 604          * with no 'mm'.
 605          */
 606         if (mm)
 607                 down_write(&mm->mmap_sem);
 608         mpol_put(current->mempolicy);
 609         current->mempolicy = new;
 610         mpol_set_task_struct_flag();
 611         if (new && new->policy == MPOL_INTERLEAVE &&
 612             nodes_weight(new->v.nodes))
 613                 current->il_next = first_node(new->v.nodes);
 614         if (mm)
 615                 up_write(&mm->mmap_sem);
 616
 617         return 0;
 618 }
 619
 620 /* Fill a zone bitmap for a policy */
 621 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 622 {
 623         nodes_clear(*nodes);
 624         switch (p->policy) {
 625         case MPOL_DEFAULT:
 626                 break;
 627         case MPOL_BIND:
 628                 /* Fall through */
 629         case MPOL_INTERLEAVE:
 630                 *nodes = p->v.nodes;
 631                 break;
 632         case MPOL_PREFERRED:
 633                 /* or use current node instead of memory_map? */
 634                 if (p->v.preferred_node < 0)
 635                         *nodes = node_states[N_HIGH_MEMORY];
 636                 else
 637                         node_set(p->v.preferred_node, *nodes);
 638                 break;
 639         default:
 640                 BUG();
 641         }
 642 }
 643
 644 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 645 {
 646         struct page *p;
 647         int err;
 648
 649         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 650         if (err >= 0) {
 651                 err = page_to_nid(p);
 652                 put_page(p);
 653         }
 654         return err;
 655 }
 656
 657 /* Retrieve NUMA policy */
 658 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 659                              unsigned long addr, unsigned long flags)
 660 {
 661         int err;
 662         struct mm_struct *mm = current->mm;
 663         struct vm_area_struct *vma = NULL;
 664         struct mempolicy *pol = current->mempolicy;
 665
 666         cpuset_update_task_memory_state();
 667         if (flags &
 668                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 669                 return -EINVAL;
 670
 671         if (flags & MPOL_F_MEMS_ALLOWED) {
 672                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 673                         return -EINVAL;
 674                 *policy = 0;    /* just so it's initialized */
 675                 *nmask  = cpuset_current_mems_allowed;
 676                 return 0;
 677         }
 678
 679         if (flags & MPOL_F_ADDR) {
 680                 down_read(&mm->mmap_sem);
 681                 vma = find_vma_intersection(mm, addr, addr+1);
 682                 if (!vma) {
 683                         up_read(&mm->mmap_sem);
 684                         return -EFAULT;
 685                 }
 686                 if (vma->vm_ops && vma->vm_ops->get_policy)
 687                         pol = vma->vm_ops->get_policy(vma, addr);
 688                 else
 689                         pol = vma->vm_policy;
 690         } else if (addr)
 691                 return -EINVAL;
 692
 693         if (!pol)
 694                 pol = &default_policy;
 695
 696         if (flags & MPOL_F_NODE) {
 697                 if (flags & MPOL_F_ADDR) {
 698                         err = lookup_node(mm, addr);
 699                         if (err < 0)
 700                                 goto out;
 701                         *policy = err;
 702                 } else if (pol == current->mempolicy &&
 703                                 pol->policy == MPOL_INTERLEAVE) {
 704                         *policy = current->il_next;
 705                 } else {
 706                         err = -EINVAL;
 707                         goto out;
 708                 }
 709         } else
 710                 *policy = pol->policy | pol->flags;
 711
 712         if (vma) {
 713                 up_read(&current->mm->mmap_sem);
 714                 vma = NULL;
 715         }
 716
 717         err = 0;
 718         if (nmask)
 719                 get_zonemask(pol, nmask);
 720
 721  out:
 722         if (vma)
 723                 up_read(&current->mm->mmap_sem);
 724         return err;
 725 }
 726
 727 #ifdef CONFIG_MIGRATION
 728 /*
 729  * page migration
 730  */
 731 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 732                                 unsigned long flags)
 733 {
 734         /*
 735          * Avoid migrating a page that is shared with others.
 736          */
 737         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 738                 isolate_lru_page(page, pagelist);
 739 }
 740
 741 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 742 {
 743         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 744 }
 745
 746 /*
 747  * Migrate pages from one node to a target node.
 748  * Returns error or the number of pages not migrated.
 749  */
 750 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 751                            int flags)
 752 {
 753         nodemask_t nmask;
 754         LIST_HEAD(pagelist);
 755         int err = 0;
 756
 757         nodes_clear(nmask);
 758         node_set(source, nmask);
 759
 760         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 761                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 762
 763         if (!list_empty(&pagelist))
 764                 err = migrate_pages(&pagelist, new_node_page, dest);
 765
 766         return err;
 767 }
 768
 769 /*
 770  * Move pages between the two nodesets so as to preserve the physical
 771  * layout as much as possible.
 772  *
 773  * Returns the number of page that could not be moved.
 774  */
 775 int do_migrate_pages(struct mm_struct *mm,
 776         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 777 {
 778         int busy = 0;           /* sum of positive migrate_to_node() results */
 779         int err = 0;
 780         nodemask_t tmp;         /* source nodes not yet picked */
 781
 782         down_read(&mm->mmap_sem);       /* held until the 'out' label */
 783
 784         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 785         if (err)
 786                 goto out;
 787
 788         /*
 789          * Each pass of the outer loop picks one <source, dest> pair and
 790          * migrates it.  'tmp' starts as a copy of *from_nodes and a
 791          * source bit is cleared from it once that source has been picked.
 792          * The dest of a source is node_remap(source, *from_nodes, *to_nodes).
 793          *
 794          * Prefer a source whose dest is not itself still set in 'tmp'.
 795          * If no pair is found that way, fall back to some pair whose
 796          * source and dest differ.  A pair with source == dest is a node
 797          * that would be migrating to itself, so no pages need to move and
 798          * it is skipped.
 799          *
 800          * If no bits are left in 'tmp', or if every remaining bit maps to
 801          * itself, there is nothing left to migrate and the loop ends.
 802          *
 803          * This lets us pick a pair of nodes to migrate between such that,
 804          * if possible, the dest node is not still waiting to be emptied as
 805          * a source, minimizing the risk of overloading the memory on a
 806          * node that would happen if we migrated incoming memory to a node
 807          * before migrating outgoing memory off that same node.
 808          *
 809          * A single scan of 'tmp' per pass is sufficient.  As we go, we
 810          * remember the most recent <s, d> pair that moved (s != d).  If
 811          * we find a pair that not only moved but, better, moved to an
 812          * empty slot (d is not set in 'tmp'), we break out with that pair.
 813          * Otherwise, when we finish scanning 'tmp', we at least have the
 814          * most recent <s, d> pair that moved.  If we get all the way
 815          * through the scan without finding any node that moved, there is
 816          * nothing left worth migrating.
 817          */
 818
 819         tmp = *from_nodes;
 820         while (!nodes_empty(tmp)) {
 821                 int s, d;
 822                 int source = -1;        /* -1: no movable pair found */
 823                 int dest = 0;
 824
 825                 for_each_node_mask(s, tmp) {
 826                         d = node_remap(s, *from_nodes, *to_nodes);
 827                         if (s == d)
 828                                 continue;
 829
 830                         source = s;     /* Node moved. Memorize */
 831                         dest = d;
 832
 833                         /* dest not in remaining from nodes? */
 834                         if (!node_isset(dest, tmp))
 835                                 break;
 836                 }
 837                 if (source == -1)
 838                         break;
 839
 840                 node_clear(source, tmp);
 841                 err = migrate_to_node(mm, source, dest, flags);
 842                 if (err > 0)
 843                         busy += err;
 844                 if (err < 0)
 845                         break;
 846         }
 847 out:
 848         up_read(&mm->mmap_sem);
 849
 850         /* A negative error code takes precedence over the busy count. */
 851         if (err < 0)
 852                 return err;
 853         return busy;
 854 }
 855
 856 /*
 857  * Page allocation callback handed to migrate_pages(): allocate the
 858  * replacement for @page according to the policy of the vma mapping it.
 859  *
 860  * @private carries the vma at which to start looking.  Later vmas are
 861  * tried in vm_next order until page_address_in_vma() stops reporting
 862  * -EFAULT.  N.B., this assumes that the list of pages handed to
 863  * migrate_pages() is in virtual address order.
 864  * @x is not used.
 865  */
 866 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 867 {
 868         struct vm_area_struct *vma;
 869         unsigned long uninitialized_var(address);
 870
 871         for (vma = (struct vm_area_struct *)private; vma; vma = vma->vm_next) {
 872                 address = page_address_in_vma(page, vma);
 873                 if (address != -EFAULT)
 874                         break;
 875         }
 876
 877         /* With !vma, alloc_page_vma() uses the task or system default policy. */
 878         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 879 }
 880 #else
 881
 882 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 883                                 unsigned long flags)
 884 {       /* Stub for the #else build: intentionally does nothing. */
 885 }
 886
 887 int do_migrate_pages(struct mm_struct *mm,
 888         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 889 {       /* Stub for the #else build: no migration is attempted. */
 890         return -ENOSYS; /* unconditional failure, arguments are ignored */
 891 }
 892
 893 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 894 {       /* Stub for the #else build: never provides a page. */
 895         return NULL;
 896 }
 897 #endif
 898
 899 /*
 900  * Backend of mbind(): apply (@mode, @mode_flags, @nmask) to the range
 901  * [start, start + len) of current->mm, then try to migrate any pages that
 902  * check_range() queued.  Returns 0 on success, otherwise an error code.
 903  */
 904 static long do_mbind(unsigned long start, unsigned long len,
 905                      unsigned short mode, unsigned short mode_flags,
 906                      nodemask_t *nmask, unsigned long flags)
 907 {
 908         const unsigned long valid_flags = MPOL_MF_STRICT | MPOL_MF_MOVE |
 909                                           MPOL_MF_MOVE_ALL;
 910         struct mm_struct *mm = current->mm;
 911         struct vm_area_struct *vma;
 912         struct mempolicy *new;
 913         unsigned long end;
 914         int nr_failed = 0;
 915         int err;
 916         LIST_HEAD(pagelist);
 917
 918         if (flags & ~valid_flags)
 919                 return -EINVAL;
 920         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 921                 return -EPERM;
 922         if (start & ~PAGE_MASK)
 923                 return -EINVAL;
 924
 925         if (mode == MPOL_DEFAULT)
 926                 flags &= ~MPOL_MF_STRICT;
 927
 928         /* Round the length up to whole pages; reject address wrap-around. */
 929         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 930         end = start + len;
 931         if (end < start)
 932                 return -EINVAL;
 933         if (end == start)
 934                 return 0;
 935
 936         new = mpol_new(mode, mode_flags, nmask);
 937         if (IS_ERR(new))
 938                 return PTR_ERR(new);
 939
 940         /* With the default policy (NULL), discontinuous ranges are okay. */
 941         if (!new)
 942                 flags |= MPOL_MF_DISCONTIG_OK;
 943
 944         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 945                  start, start + len, mode, mode_flags,
 946                  nmask ? nodes_addr(*nmask)[0] : -1);
 947
 948         down_write(&mm->mmap_sem);
 949         vma = check_range(mm, start, end, nmask,
 950                           flags | MPOL_MF_INVERT, &pagelist);
 951         if (IS_ERR(vma)) {
 952                 err = PTR_ERR(vma);
 953                 goto unlock;
 954         }
 955
 956         err = mbind_range(vma, start, end, new);
 957         if (!list_empty(&pagelist))
 958                 nr_failed = migrate_pages(&pagelist, new_vma_page,
 959                                           (unsigned long)vma);
 960         if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 961                 err = -EIO;
 962 unlock:
 963         up_write(&mm->mmap_sem);
 964         mpol_put(new);
 965         return err;
 966 }
 967
 968 /*
 969  * User space interface with variable sized bitmaps for nodelists.
 970  */
 971
 972 /* Copy a node mask from user space; bits beyond MAX_NUMNODES must be 0. */
 973 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 974                      unsigned long maxnode)
 975 {
 976         const unsigned long kernel_words = BITS_TO_LONGS(MAX_NUMNODES);
 977         unsigned long nwords, tail_mask, idx;
 978
 979         --maxnode;
 980         nodes_clear(*nodes);
 981         if (maxnode == 0 || !nmask)
 982                 return 0;
 983         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 984                 return -EINVAL;
 985
 986         nwords = BITS_TO_LONGS(maxnode);
 987         /* Mask of the valid bits in the last user-supplied word. */
 988         tail_mask = (maxnode % BITS_PER_LONG) ?
 989                         (1UL << (maxnode % BITS_PER_LONG)) - 1 : ~0UL;
 990
 991         /*
 992          * The user passed more words than the kernel mask holds: accept
 993          * that only if every bit in the unsupported part is zero.
 994          */
 995         if (nwords > kernel_words) {
 996                 if (nwords > PAGE_SIZE/sizeof(long))
 997                         return -EINVAL;
 998                 for (idx = kernel_words; idx < nwords; idx++) {
 999                         unsigned long word;
1000
1001                         if (get_user(word, nmask + idx))
1002                                 return -EFAULT;
1003                         if (idx == nwords - 1)
1004                                 word &= tail_mask;      /* valid bits only */
1005                         if (word)
1006                                 return -EINVAL;
1007                 }
1008                 nwords = kernel_words;
1009                 tail_mask = ~0UL;
1010         }
1011
1012         if (copy_from_user(nodes_addr(*nodes), nmask, nwords*sizeof(unsigned long)))
1013                 return -EFAULT;
1014         nodes_addr(*nodes)[nwords - 1] &= tail_mask;
1015         return 0;
1016 }
1017
1018 /* Copy a kernel node mask to user space, zeroing any excess user bytes. */
1019 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1020                               nodemask_t *nodes)
1021 {
1022         const int kernel_bytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1023         unsigned long user_bytes = ALIGN(maxnode-1, 64) / 8;
1024
1025         if (user_bytes > kernel_bytes) {
1026                 if (user_bytes > PAGE_SIZE)
1027                         return -EINVAL;
1028                 if (clear_user((char __user *)mask + kernel_bytes, user_bytes - kernel_bytes))
1029                         return -EFAULT;
1030                 user_bytes = kernel_bytes;
1031         }
1032         return copy_to_user(mask, nodes_addr(*nodes), user_bytes) ? -EFAULT : 0;
1033 }
1034
1035 /* mbind(): split mode flags from the mode, fetch the nodemask, call do_mbind(). */
1036 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1037                         unsigned long mode,
1038                         unsigned long __user *nmask, unsigned long maxnode,
1039                         unsigned flags)
1040 {
1041         unsigned short mode_flags = mode & MPOL_MODE_FLAGS;
1042         nodemask_t nodes;
1043         int err;
1044
1045         mode &= ~MPOL_MODE_FLAGS;
1046         /* Reject an out-of-range mode, or both nodemask flags at once. */
1047         if (mode >= MPOL_MAX ||
1048             ((mode_flags & MPOL_F_STATIC_NODES) &&
1049              (mode_flags & MPOL_F_RELATIVE_NODES)))
1050                 return -EINVAL;
1051         err = get_nodes(&nodes, nmask, maxnode);
1052         if (err)
1053                 return err;
1054         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1055 }
1056
1057 /* Set the process memory policy */
1058 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1059                 unsigned long maxnode)
1060 {
1061         unsigned short flags = mode & MPOL_MODE_FLAGS;
1062         nodemask_t nodes;
1063         int err;
1064
1065         /* From here on @mode no longer carries the MPOL_MODE_FLAGS bits. */
1066         mode &= ~MPOL_MODE_FLAGS;
1067         if ((unsigned int)mode >= MPOL_MAX ||
1068             ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)))
1069                 return -EINVAL;
1070
1071         err = get_nodes(&nodes, nmask, maxnode);
1072         if (err)
1073                 return err;
1074         return do_set_mempolicy(mode, flags, &nodes);
1075 }
1076
1077 /*
1078  * migrate_pages(): move pages of the task identified by @pid (0 means the
1079  * caller) from the nodes in @old_nodes to the nodes in @new_nodes.
1080  */
1081 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1082                 const unsigned long __user *old_nodes,
1083                 const unsigned long __user *new_nodes)
1084 {
1085         struct mm_struct *mm;
1086         struct task_struct *task;
1087         nodemask_t old, new, task_nodes;
1088         int err;
1089
1090         err = get_nodes(&old, old_nodes, maxnode);
1091         if (err)
1092                 return err;
1093
1094         err = get_nodes(&new, new_nodes, maxnode);
1095         if (err)
1096                 return err;
1097
1098         /* Find the task and pin it: it is used after the lock is dropped. */
1099         read_lock(&tasklist_lock);
1100         task = pid ? find_task_by_vpid(pid) : current;
1101         if (!task) {
1102                 read_unlock(&tasklist_lock);
1103                 return -ESRCH;
1104         }
1105         get_task_struct(task);
1106         mm = get_task_mm(task);
1107         read_unlock(&tasklist_lock);
1108         if (!mm) {
1109                 err = -EINVAL;
1110                 goto out_put_task;
1111         }
1112
1113         /* Allowed with CAP_SYS_NICE or when uid/euid matches target uid/suid. */
1114         if ((current->euid != task->suid) && (current->euid != task->uid) &&
1115             (current->uid != task->suid) && (current->uid != task->uid) &&
1116             !capable(CAP_SYS_NICE)) {
1117                 err = -EPERM;
1118                 goto out;
1119         }
1120
1121         task_nodes = cpuset_mems_allowed(task);
1122         /* Is the user allowed to access the target nodes? */
1123         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1124                 err = -EPERM;
1125                 goto out;
1126         }
1127
1128         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1129                 err = -EINVAL;
1130                 goto out;
1131         }
1132
1133         err = security_task_movememory(task);
1134         if (err)
1135                 goto out;
1136
1137         err = do_migrate_pages(mm, &old, &new,
1138                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1139 out:
1140         mmput(mm);
1141 out_put_task:
1142         put_task_struct(task);  /* pairs with get_task_struct() above */
1143         return err;
1144 }
1145
1146 /*
1147  * Retrieve NUMA policy.  The value reported by do_get_mempolicy() goes
1148  * to @policy and the node mask to @nmask; a NULL pointer skips that output.
1149  */
1150 asmlinkage long sys_get_mempolicy(int __user *policy,
1151                                 unsigned long __user *nmask,
1152                                 unsigned long maxnode,
1153                                 unsigned long addr, unsigned long flags)
1154 {
1155         nodemask_t nodes;
1156         int uninitialized_var(pval);
1157         int err;
1158
1159         /* A supplied mask buffer must cover at least MAX_NUMNODES bits. */
1160         if (nmask && maxnode < MAX_NUMNODES)
1161                 return -EINVAL;
1162
1163         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1164         if (err)
1165                 return err;
1166         if (policy && put_user(pval, policy))
1167                 return -EFAULT;
1168         if (!nmask)
1169                 return 0;
1170         return copy_nodes_to_user(nmask, maxnode, &nodes);
1171 }
1172
1173 #ifdef CONFIG_COMPAT
1174
1175 /* Compat get_mempolicy(): the node mask is bounced through a native-layout copy. */
1176 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1177                                      compat_ulong_t __user *nmask,
1178                                      compat_ulong_t maxnode,
1179                                      compat_ulong_t addr, compat_ulong_t flags)
1180 {
1181         long err;
1182         unsigned long __user *nm = NULL;        /* native-layout staging area */
1183         unsigned long nr_bits, alloc_size;
1184         DECLARE_BITMAP(bm, MAX_NUMNODES);
1185
1186         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1187         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1188         if (nmask)
1189                 nm = compat_alloc_user_space(alloc_size);
1190
1191         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1192         if (!err && nmask) {
1193                 err = copy_from_user(bm, nm, alloc_size);
1194                 /* ensure entire bitmap is zeroed */
1195                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1196                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1197                 if (err)        /* helpers may return a byte count, not an errno */
1198                         err = -EFAULT;
1199         }
1200         return err;
1201 }
1202
1203 /* Compat set_mempolicy(): convert the user node mask, then call the native entry. */
1204 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1205                                      compat_ulong_t maxnode)
1206 {
1207         DECLARE_BITMAP(bm, MAX_NUMNODES);
1208         unsigned long nr_bits, alloc_size;
1209         unsigned long __user *nm = NULL;
1210
1211         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1212         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1213
1214         if (nmask) {
1215                 long failed = compat_get_bitmap(bm, nmask, nr_bits);
1216
1217                 /* Stage the converted mask where the native call reads it. */
1218                 nm = compat_alloc_user_space(alloc_size);
1219                 failed |= copy_to_user(nm, bm, alloc_size);
1220                 if (failed)
1221                         return -EFAULT;
1222         }
1223         return sys_set_mempolicy(mode, nm, nr_bits+1);
1224 }
1225
1226 /* Compat mbind(): convert the user node mask, then call the native sys_mbind(). */
1227 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1228                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1229                              compat_ulong_t maxnode, compat_ulong_t flags)
1230 {
1231         nodemask_t bm;
1232         unsigned long nr_bits, alloc_size;
1233         unsigned long __user *nm = NULL;
1234
1235         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1236         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1237
1238         if (nmask) {
1239                 long failed = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1240
1241                 /* Stage the converted mask where sys_mbind() reads it. */
1242                 nm = compat_alloc_user_space(alloc_size);
1243                 failed |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1244                 if (failed)
1245                         return -EFAULT;
1246         }
1247         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1248 }
1249
1250 #endif
1251
1252 /*
1253  * get_vma_policy(@task, @vma, @addr)
1254  * @task - task whose policy is the fallback when @vma supplies none
1255  * @vma   - virtual memory area whose policy is sought; may be NULL
1256  * @addr  - address in @vma for shared policy lookup
1257  *
1258  * Returns the effective policy for a VMA at the specified address.
1259  * Falls back to @task's policy or the system default policy, as necessary.
1260  * Returned policy has extra reference count if shared, vma,
1261  * or some other task's policy [show_numa_maps() can pass
1262  * @task != current].  It is the caller's responsibility to
1263  * free the reference in these cases.
1264  */
1265 static struct mempolicy *get_vma_policy(struct task_struct *task,
1266                 struct vm_area_struct *vma, unsigned long addr)
1267 {
1268         struct mempolicy *pol = task->mempolicy;
1269         int from_get_policy = 0;        /* vm_ops->get_policy() was consulted */
1270
1271         if (vma && vma->vm_ops && vma->vm_ops->get_policy) {
1272                 struct mempolicy *vpol = vma->vm_ops->get_policy(vma, addr);
1273                 if (vpol)
1274                         pol = vpol;
1275                 from_get_policy = 1;
1276         } else if (vma && vma->vm_policy &&
1277                    vma->vm_policy->policy != MPOL_DEFAULT) {
1278                 pol = vma->vm_policy;
1279         }
1280
1281         if (!pol)
1282                 return &default_policy;
1283         /* vma or other task's policy: take the reference here */
1284         if (!from_get_policy && pol != current->mempolicy)
1285                 mpol_get(pol);
1286         return pol;
1287 }
1288
1289 /* Return the nodemask to apply for a mempolicy, or NULL for none */
1290 static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
1291 {
1292         if (unlikely(policy->policy == MPOL_BIND)) {
1293                 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1294                 if (gfp_zone(gfp) >= policy_zone &&
1295                     cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1296                         return &policy->v.nodes;
1297         }
1298         return NULL;
1299 }
1300
1301 /* Return a zonelist representing a mempolicy */
1302 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1303 {
1304         int nd;
1305
1306         switch (policy->policy) {
1307         case MPOL_BIND:
1308                 /*
1309                  * Normally, MPOL_BIND allocations are node-local within
1310                  * the allowed nodemask.  However, if __GFP_THISNODE is set
1311                  * and the current node is not part of the mask, we use the
1312                  * zonelist for the first node in the mask instead.
1313                  */
1314                 nd = numa_node_id();
1315                 if (unlikely(gfp & __GFP_THISNODE))
1316                         if (unlikely(!node_isset(nd, policy->v.nodes)))
1317                                 nd = first_node(policy->v.nodes);
1318                 break;
1319         case MPOL_PREFERRED:
1320                 /* A negative preferred node selects the local node. */
1321                 nd = (policy->v.preferred_node < 0) ? numa_node_id()
1322                                                     : policy->v.preferred_node;
1323                 break;
1324         case MPOL_INTERLEAVE: /* should not happen */
1325         case MPOL_DEFAULT:
1326                 nd = numa_node_id();
1327                 break;
1328         default:
1329                 nd = 0;
1330                 BUG();
1331         }
1332         return node_zonelist(nd, gfp);
1333 }
1334
1335 /* Do dynamic interleaving for a process: return current->il_next and
1336    advance it to the next node of the policy mask, wrapping at the end. */
1337 static unsigned interleave_nodes(struct mempolicy *policy)
1338 {
1339         struct task_struct *tsk = current;
1340         unsigned this_nid = tsk->il_next;
1341         unsigned following = next_node(this_nid, policy->v.nodes);
1342
1343         if (following >= MAX_NUMNODES)
1344                 following = first_node(policy->v.nodes);
1345         if (following < MAX_NUMNODES)
1346                 tsk->il_next = following;
1347         return this_nid;
1348 }
1349
1350 /*
1351  * Depending on the memory policy provide a node from which to allocate the
1352  * next slab entry.  A NULL @policy is handled like MPOL_DEFAULT.
1353  */
1354 unsigned slab_node(struct mempolicy *policy)
1355 {
1356         unsigned short mode = policy ? policy->policy : MPOL_DEFAULT;
1357
1358         if (mode == MPOL_INTERLEAVE)
1359                 return interleave_nodes(policy);
1360
1361         if (mode == MPOL_BIND) {
1362                 /*
1363                  * Follow bind policy behavior and start allocation at the
1364                  * first node.
1365                  */
1366                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1367                 struct zonelist *zonelist;
1368                 struct zone *zone;
1369
1370                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1371                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1372                                                         &policy->v.nodes,
1373                                                         &zone);
1374                 return zone->node;
1375         }
1376
1377         /*
1378          * MPOL_PREFERRED with an explicit (non-negative) node uses that
1379          * node; anything else ends up on the local node.
1380          */
1381         if (mode == MPOL_PREFERRED && policy->v.preferred_node >= 0)
1382                 return policy->v.preferred_node;
1383
1384         return numa_node_id();
1385 }
1386
1387 /* Do static interleaving for a VMA with known offset. */
1388 static unsigned offset_il_node(struct mempolicy *pol,
1389                 struct vm_area_struct *vma, unsigned long off)
1390 {
1391         unsigned nnodes = nodes_weight(pol->v.nodes);
1392         unsigned target;
1393         int step;
1394         int nid = -1;
1395
1396         /* Empty mask: nothing to interleave over, use the local node. */
1397         if (!nnodes)
1398                 return numa_node_id();
1399
1400         /* Advance through the mask target + 1 times, starting before it. */
1401         target = (unsigned int)off % nnodes;
1402         for (step = 0; step <= target; step++)
1403                 nid = next_node(nid, pol->v.nodes);
1404         return nid;
1405 }
1406
1407 /* Determine a node number for interleave */
1408 static inline unsigned interleave_nid(struct mempolicy *pol,
1409                  struct vm_area_struct *vma, unsigned long addr, int shift)
1410 {
1411         unsigned long off;
1412
1413         /* Without a vma there is no offset: use the per-task rotation. */
1414         if (!vma)
1415                 return interleave_nodes(pol);
1416
1417         /*
1418          * For small pages, shift and PAGE_SHIFT do not differ, so the
1419          * bit-shift is safe.  For huge pages, since vm_pgoff is in units
1420          * of small pages, the always-0 bits are shifted off to get a
1421          * useful offset.
1422          */
1423         BUG_ON(shift < PAGE_SHIFT);
1424         off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1425         off += (addr - vma->vm_start) >> shift;
1426         return offset_il_node(pol, vma, off);
1427 }
1428
1429 #ifdef CONFIG_HUGETLBFS
1430 /*
1431  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
1432  * @vma = virtual memory area whose policy is sought
1433  * @addr = address in @vma for shared policy lookup and interleave policy
1434  * @gfp_flags = for requested zone
1435  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1436  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1437  *
1438  * Returns a zonelist suitable for a huge page allocation.
1439  * If the effective policy is MPOL_BIND, *@nodemask is set to point at the
1440  * policy's nodemask so that the caller can filter the zonelist with it.
1441  * If it is also a policy for which get_vma_policy() returns an extra
1442  * reference, we must hold that reference until after the allocation.
1443  * In that case the policy is returned via @mpol so the hugetlb allocation
1444  * can drop the reference.  For referenced policies other than MPOL_BIND
1445  * the reference is dropped here, so the caller doesn't need to know about
1446  * the special case for default and current task policy.
1447  */
1448 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1449                                 gfp_t gfp_flags, struct mempolicy **mpol,
1450                                 nodemask_t **nodemask)
1451 {
1452         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1453         struct zonelist *zl;
1454         /* Defaults for both out-parameters; overridden below where needed. */
1455         *mpol = NULL;           /* probably no unref needed */
1456         *nodemask = NULL;       /* assume !MPOL_BIND */
1457         if (pol->policy == MPOL_BIND) {
1458                         *nodemask = &pol->v.nodes;
1459         } else if (pol->policy == MPOL_INTERLEAVE) {
1460                 unsigned nid;
1461                 /* Pick the node first; pol is not needed after that. */
1462                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1463                 if (unlikely(pol != &default_policy &&
1464                                 pol != current->mempolicy))
1465                         __mpol_put(pol);        /* finished with pol */
1466                 return node_zonelist(nid, gfp_flags);
1467         }
1468         /* NOTE(review): GFP_HIGHUSER is passed here, not @gfp_flags; confirm. */
1469         zl = zonelist_policy(GFP_HIGHUSER, pol);
1470         if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1471                 if (pol->policy != MPOL_BIND)
1472                         __mpol_put(pol);        /* finished with pol */
1473                 else
1474                         *mpol = pol;    /* unref needed after allocation */
1475         }
1476         return zl;
1477 }
1478 #endif
1479
1480 /* Allocate a page in interleaved policy.
1481    Own path because it needs to do special accounting. */
1482 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1483                                         unsigned nid)
1484 {
1485         struct zonelist *zl = node_zonelist(nid, gfp);
1486         struct page *page = __alloc_pages(gfp, order, zl);
1487         if (!page)
1488                 return NULL;
1489         /* Count a hit only if the page's zone is the zonelist's first zone. */
1490         if (page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1491                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1492         return page;
1493 }
1494
1495 /**
1496  *      alloc_page_vma  - Allocate a page for a VMA.
1497  *
1498  *      @gfp:
1499  *      %GFP_USER    user allocation.
1500  *      %GFP_KERNEL  kernel allocations,
1501  *      %GFP_HIGHMEM highmem/user allocations,
1502  *      %GFP_FS      allocation should not call back into a file system.
1503  *      %GFP_ATOMIC  don't sleep.
1504  *
1505  *      @vma:  Pointer to VMA or NULL if not available.
1506  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1507  *
1508  *      This function allocates a page from the kernel page pool and applies
1509  *      a NUMA policy associated with the VMA or the current process.
1510  *      When @vma is not NULL the caller must hold down_read on the mmap_sem
1511  *      of the mm_struct of the VMA to prevent it from going away.  Should
1512  *      be used for all allocations for pages that will be mapped into
1513  *      user space. Returns NULL when no page can be allocated.
1514  *
1515  *      Should be called with the mmap_sem of the vma held.
1516  */
1517 struct page *
1518 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1519 {
1520         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1521         struct zonelist *zl;
1522         /* pol may carry a reference from get_vma_policy(); dropped below. */
1523         cpuset_update_task_memory_state();
1524         /* Interleave picks its node here and allocates via its own path. */
1525         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1526                 unsigned nid;
1527
1528                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1529                 if (unlikely(pol != &default_policy &&
1530                                 pol != current->mempolicy))
1531                         __mpol_put(pol);        /* finished with pol */
1532                 return alloc_page_interleave(gfp, 0, nid);
1533         }
1534         zl = zonelist_policy(gfp, pol);
1535         if (pol != &default_policy && pol != current->mempolicy) {
1536                 /*
1537                  * slow path: ref counted policy; put it after allocating
1538                  */
1539                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1540                                                 zl, nodemask_policy(gfp, pol));
1541                 __mpol_put(pol);
1542                 return page;
1543         }
1544         /*
1545          * fast path: default or current task policy, nothing to put
1546          */
1547         return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
1548 }
1549
1550 /**
1551  *      alloc_pages_current - Allocate pages.
1552  *
1553  *      @gfp:
1554  *              %GFP_USER   user allocation,
1555  *              %GFP_KERNEL kernel allocation,
1556  *              %GFP_HIGHMEM highmem allocation,
1557  *              %GFP_FS     don't call back into a file system.
1558  *              %GFP_ATOMIC don't sleep.
1559  *      @order: Power of two of allocation size in pages. 0 is a single page.
1560  *
1561  *      Allocate pages from the kernel page pool.  When not in interrupt
1562  *      context, and unless __GFP_THISNODE is set, the current process'
1563  *      NUMA policy is applied.  Returns NULL when no page can be allocated.
1564  *
1565  *      cpuset_update_task_memory_state() is only called when the
1566  *      allocation may wait (__GFP_WAIT) and we are not in interrupt.
1567  */
1568 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1569 {
1570         struct mempolicy *pol = current->mempolicy;
1571         struct zonelist *zl;
1572         if ((gfp & __GFP_WAIT) && !in_interrupt())
1573                 cpuset_update_task_memory_state();
1574         /* No task policy, interrupt context or __GFP_THISNODE: use default. */
1575         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1576                 pol = &default_policy;
1577         if (pol->policy == MPOL_INTERLEAVE)
1578                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1579         zl = zonelist_policy(gfp, pol);
1580         return __alloc_pages_nodemask(gfp, order, zl, nodemask_policy(gfp, pol));
1581 }
1582 EXPORT_SYMBOL(alloc_pages_current);
1583
1584 /*
1585  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1586  * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1587  * with the mems_allowed returned by cpuset_mems_allowed().  This
1588  * keeps mempolicies cpuset relative after its cpuset moves.  See
1589  * further kernel/cpuset.c update_nodemask().
1590  */
1591
1592 /* Slow path of a mempolicy duplicate */
1593 struct mempolicy *__mpol_dup(struct mempolicy *old)
1594 {
1595         struct mempolicy *copy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1596         if (!copy)
1597                 return ERR_PTR(-ENOMEM);
1598         /* Rebind @old first so that the copy below picks up the result. */
1599         if (current_cpuset_is_being_rebound()) {
1600                 nodemask_t allowed = cpuset_mems_allowed(current);
1601                 mpol_rebind_policy(old, &allowed);
1602         }
1603         *copy = *old;
1604         atomic_set(&copy->refcnt, 1);
1605         return copy;
1606 }
1607
1608 /* Same mode flags and, where a user nodemask is stored, the same mask? */
1609 static int mpol_match_intent(const struct mempolicy *a,
1610                              const struct mempolicy *b)
1611 {
1612         if (a->flags != b->flags)
1613                 return 0;
1614         return mpol_store_user_nodemask(a) ?
1615                 nodes_equal(a->w.user_nodemask, b->w.user_nodemask) : 1;
1616 }
1617
1618 /*
1619  * Slow path of a mempolicy comparison.  Returns nonzero when @a and @b
1620  * describe the same policy; a NULL argument never compares equal.
1621  */
1622 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1623 {
1624         if (!a || !b || a->policy != b->policy)
1625                 return 0;
1626         if (a->policy == MPOL_DEFAULT)
1627                 return 1;
1628         if (!mpol_match_intent(a, b))
1629                 return 0;
1630         switch (a->policy) {
1631         case MPOL_BIND:
1632         case MPOL_INTERLEAVE:
1633                 return nodes_equal(a->v.nodes, b->v.nodes);
1634         case MPOL_PREFERRED:
1635                 return a->v.preferred_node == b->v.preferred_node;
1636         default:
1637                 BUG();
1638                 return 0;
1639         }
1640 }
1641
1642 /* Slow path of a mpol destructor. */
1643 void __mpol_put(struct mempolicy *p)
1644 {
1645         if (!atomic_dec_and_test(&p->refcnt))
1646                 return;
1647         p->policy = MPOL_DEFAULT;
1648         kmem_cache_free(policy_cache, p);
1649 }
1650
1651 /*
1652  * Shared memory backing store policy support.
1653  *
1654  * Remember policies even when nobody has shared memory mapped.
1655  * The policies are kept in a red-black tree linked from the inode.
1656  * They are protected by the sp->lock spinlock, which should be held
1657  * for any accesses to the tree.
1658  */
1659
1660 /* lookup first element intersecting start-end */
1661 /* Caller holds sp->lock */
1662 static struct sp_node *
1663 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1664 {
1665         struct rb_node *n = sp->root.rb_node;
1666
1667         while (n) {
1668                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1669
1670                 if (start >= p->end)
1671                         n = n->rb_right;
1672                 else if (end <= p->start)
1673                         n = n->rb_left;
1674                 else
1675                         break;
1676         }
1677         if (!n)
1678                 return NULL;
1679         for (;;) {
1680                 struct sp_node *w = NULL;
1681                 struct rb_node *prev = rb_prev(n);
1682                 if (!prev)
1683                         break;
1684                 w = rb_entry(prev, struct sp_node, nd);
1685                 if (w->end <= start)
1686                         break;
1687                 n = prev;
1688         }
1689         return rb_entry(n, struct sp_node, nd);
1690 }
1691
1692 /* Insert a new shared policy into the list. */
1693 /* Caller holds sp->lock */
1694 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1695 {
1696         struct rb_node **p = &sp->root.rb_node;
1697         struct rb_node *parent = NULL;
1698         struct sp_node *nd;
1699
1700         while (*p) {
1701                 parent = *p;
1702                 nd = rb_entry(parent, struct sp_node, nd);
1703                 if (new->start < nd->start)
1704                         p = &(*p)->rb_left;
1705                 else if (new->end > nd->end)
1706                         p = &(*p)->rb_right;
1707                 else
1708                         BUG();
1709         }
1710         rb_link_node(&new->nd, parent, p);
1711         rb_insert_color(&new->nd, &sp->root);
1712         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1713                  new->policy ? new->policy->policy : 0);
1714 }
1715
1716 /* Find shared policy intersecting idx */
1717 struct mempolicy *
1718 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1719 {
1720         struct mempolicy *pol = NULL;
1721         struct sp_node *sn;
1722
1723         if (!sp->root.rb_node)
1724                 return NULL;
1725         spin_lock(&sp->lock);
1726         sn = sp_lookup(sp, idx, idx+1);
1727         if (sn) {
1728                 mpol_get(sn->policy);
1729                 pol = sn->policy;
1730         }
1731         spin_unlock(&sp->lock);
1732         return pol;
1733 }
1734
1735 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1736 {
1737         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1738         rb_erase(&n->nd, &sp->root);
1739         mpol_put(n->policy);
1740         kmem_cache_free(sn_cache, n);
1741 }
1742
1743 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1744                                 struct mempolicy *pol)
1745 {
1746         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1747
1748         if (!n)
1749                 return NULL;
1750         n->start = start;
1751         n->end = end;
1752         mpol_get(pol);
1753         n->policy = pol;
1754         return n;
1755 }
1756
1757 /* Replace a policy range. */
1758 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1759                                  unsigned long end, struct sp_node *new)
1760 {
1761         struct sp_node *n, *new2 = NULL;
1762
1763 restart:
1764         spin_lock(&sp->lock);
1765         n = sp_lookup(sp, start, end);
1766         /* Take care of old policies in the same range. */
1767         while (n && n->start < end) {
1768                 struct rb_node *next = rb_next(&n->nd);
1769                 if (n->start >= start) {
1770                         if (n->end <= end)
1771                                 sp_delete(sp, n);
1772                         else
1773                                 n->start = end;
1774                 } else {
1775                         /* Old policy spanning whole new range. */
1776                         if (n->end > end) {
1777                                 if (!new2) {
1778                                         spin_unlock(&sp->lock);
1779                                         new2 = sp_alloc(end, n->end, n->policy);
1780                                         if (!new2)
1781                                                 return -ENOMEM;
1782                                         goto restart;
1783                                 }
1784                                 n->end = start;
1785                                 sp_insert(sp, new2);
1786                                 new2 = NULL;
1787                                 break;
1788                         } else
1789                                 n->end = start;
1790                 }
1791                 if (!next)
1792                         break;
1793                 n = rb_entry(next, struct sp_node, nd);
1794         }
1795         if (new)
1796                 sp_insert(sp, new);
1797         spin_unlock(&sp->lock);
1798         if (new2) {
1799                 mpol_put(new2->policy);
1800                 kmem_cache_free(sn_cache, new2);
1801         }
1802         return 0;
1803 }
1804
1805 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1806                         unsigned short flags, nodemask_t *policy_nodes)
1807 {
1808         info->root = RB_ROOT;
1809         spin_lock_init(&info->lock);
1810
1811         if (policy != MPOL_DEFAULT) {
1812                 struct mempolicy *newpol;
1813
1814                 /* Falls back to MPOL_DEFAULT on any error */
1815                 newpol = mpol_new(policy, flags, policy_nodes);
1816                 if (!IS_ERR(newpol)) {
1817                         /* Create pseudo-vma that contains just the policy */
1818                         struct vm_area_struct pvma;
1819
1820                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1821                         /* Policy covers entire file */
1822                         pvma.vm_end = TASK_SIZE;
1823                         mpol_set_shared_policy(info, &pvma, newpol);
1824                         mpol_put(newpol);
1825                 }
1826         }
1827 }
1828
1829 int mpol_set_shared_policy(struct shared_policy *info,
1830                         struct vm_area_struct *vma, struct mempolicy *npol)
1831 {
1832         int err;
1833         struct sp_node *new = NULL;
1834         unsigned long sz = vma_pages(vma);
1835
1836         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1837                  vma->vm_pgoff,
1838                  sz, npol ? npol->policy : -1,
1839                  npol ? npol->flags : -1,
1840                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1841
1842         if (npol) {
1843                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1844                 if (!new)
1845                         return -ENOMEM;
1846         }
1847         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1848         if (err && new)
1849                 kmem_cache_free(sn_cache, new);
1850         return err;
1851 }
1852
1853 /* Free a backing policy store on inode delete. */
1854 void mpol_free_shared_policy(struct shared_policy *p)
1855 {
1856         struct sp_node *n;
1857         struct rb_node *next;
1858
1859         if (!p->root.rb_node)
1860                 return;
1861         spin_lock(&p->lock);
1862         next = rb_first(&p->root);
1863         while (next) {
1864                 n = rb_entry(next, struct sp_node, nd);
1865                 next = rb_next(&n->nd);
1866                 rb_erase(&n->nd, &p->root);
1867                 mpol_put(n->policy);
1868                 kmem_cache_free(sn_cache, n);
1869         }
1870         spin_unlock(&p->lock);
1871 }
1872
1873 /* assumes fs == KERNEL_DS */
1874 void __init numa_policy_init(void)
1875 {
1876         nodemask_t interleave_nodes;
1877         unsigned long largest = 0;
1878         int nid, prefer = 0;
1879
1880         policy_cache = kmem_cache_create("numa_policy",
1881                                          sizeof(struct mempolicy),
1882                                          0, SLAB_PANIC, NULL);
1883
1884         sn_cache = kmem_cache_create("shared_policy_node",
1885                                      sizeof(struct sp_node),
1886                                      0, SLAB_PANIC, NULL);
1887
1888         /*
1889          * Set interleaving policy for system init. Interleaving is only
1890          * enabled across suitably sized nodes (default is >= 16MB), or
1891          * fall back to the largest node if they're all smaller.
1892          */
1893         nodes_clear(interleave_nodes);
1894         for_each_node_state(nid, N_HIGH_MEMORY) {
1895                 unsigned long total_pages = node_present_pages(nid);
1896
1897                 /* Preserve the largest node */
1898                 if (largest < total_pages) {
1899                         largest = total_pages;
1900                         prefer = nid;
1901                 }
1902
1903                 /* Interleave this node? */
1904                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1905                         node_set(nid, interleave_nodes);
1906         }
1907
1908         /* All too small, use the largest */
1909         if (unlikely(nodes_empty(interleave_nodes)))
1910                 node_set(prefer, interleave_nodes);
1911
1912         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1913                 printk("numa_policy_init: interleaving failed\n");
1914 }
1915
1916 /* Reset policy of current process to default */
1917 void numa_default_policy(void)
1918 {
1919         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1920 }
1921
1922 /*
1923  * Display pages allocated per node and memory policy via /proc.
1924  */
/*
 * Printable policy names; mpol_to_str() indexes this table with the
 * policy mode, so the order must follow the mode constants.
 */
static const char * const policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave",
};
1927
1928 /*
1929  * Convert a mempolicy into a string.
1930  * Returns the number of characters in buffer (if positive)
1931  * or an error (negative)
1932  */
1933 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1934 {
1935         char *p = buffer;
1936         int l;
1937         nodemask_t nodes;
1938         unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
1939         unsigned short flags = pol ? pol->flags : 0;
1940
1941         switch (mode) {
1942         case MPOL_DEFAULT:
1943                 nodes_clear(nodes);
1944                 break;
1945
1946         case MPOL_PREFERRED:
1947                 nodes_clear(nodes);
1948                 node_set(pol->v.preferred_node, nodes);
1949                 break;
1950
1951         case MPOL_BIND:
1952                 /* Fall through */
1953         case MPOL_INTERLEAVE:
1954                 nodes = pol->v.nodes;
1955                 break;
1956
1957         default:
1958                 BUG();
1959                 return -EFAULT;
1960         }
1961
1962         l = strlen(policy_types[mode]);
1963         if (buffer + maxlen < p + l + 1)
1964                 return -ENOSPC;
1965
1966         strcpy(p, policy_types[mode]);
1967         p += l;
1968
1969         if (flags) {
1970                 int need_bar = 0;
1971
1972                 if (buffer + maxlen < p + 2)
1973                         return -ENOSPC;
1974                 *p++ = '=';
1975
1976                 if (flags & MPOL_F_STATIC_NODES)
1977                         p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
1978                 if (flags & MPOL_F_RELATIVE_NODES)
1979                         p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1980         }
1981
1982         if (!nodes_empty(nodes)) {
1983                 if (buffer + maxlen < p + 2)
1984                         return -ENOSPC;
1985                 *p++ = '=';
1986                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1987         }
1988         return p - buffer;
1989 }
1990
1991 struct numa_maps {                      /* per-VMA page statistics filled in by gather_stats() */
1992         unsigned long pages;            /* every page visited */
1993         unsigned long anon;             /* pages for which PageAnon() is set */
1994         unsigned long active;           /* pages for which PageActive() is set */
1995         unsigned long writeback;        /* pages for which PageWriteback() is set */
1996         unsigned long mapcount_max;     /* largest page_mapcount() seen */
1997         unsigned long dirty;            /* pages with a dirty pte or PageDirty() set */
1998         unsigned long swapcache;        /* pages for which PageSwapCache() is set */
1999         unsigned long node[MAX_NUMNODES];       /* page count per node id (page_to_nid()) */
2000 };
2001
2002 static void gather_stats(struct page *page, void *private, int pte_dirty)
2003 {
2004         struct numa_maps *md = private;
2005         int count = page_mapcount(page);
2006
2007         md->pages++;
2008         if (pte_dirty || PageDirty(page))
2009                 md->dirty++;
2010
2011         if (PageSwapCache(page))
2012                 md->swapcache++;
2013
2014         if (PageActive(page))
2015                 md->active++;
2016
2017         if (PageWriteback(page))
2018                 md->writeback++;
2019
2020         if (PageAnon(page))
2021                 md->anon++;
2022
2023         if (count > md->mapcount_max)
2024                 md->mapcount_max = count;
2025
2026         md->node[page_to_nid(page)]++;
2027 }
2028
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather page statistics for the huge pages mapped in [start, end) of
 * @vma, stepping by HPAGE_SIZE.  Addresses with no huge pte, an empty pte
 * or no page are skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;
        struct page *page;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                pte_t pte;

                if (!ptep)
                        continue;

                /* Work on one snapshot of the entry for all checks below. */
                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (!page)
                        continue;

                /*
                 * Use the snapshot for the dirty bit as well, instead of
                 * reading *ptep a second time, so the page and its dirty
                 * state come from the same pte value.
                 */
                gather_stats(page, md, pte_dirty(pte));
        }
}
#else
/* Without hugetlb support there is nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
2062
2063 int show_numa_map(struct seq_file *m, void *v)
2064 {
2065         struct proc_maps_private *priv = m->private;
2066         struct vm_area_struct *vma = v;
2067         struct numa_maps *md;
2068         struct file *file = vma->vm_file;
2069         struct mm_struct *mm = vma->vm_mm;
2070         struct mempolicy *pol;
2071         int n;
2072         char buffer[50];
2073
2074         if (!mm)
2075                 return 0;
2076
2077         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2078         if (!md)
2079                 return 0;
2080
2081         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2082         mpol_to_str(buffer, sizeof(buffer), pol);
2083         /*
2084          * unref shared or other task's mempolicy
2085          */
2086         if (pol != &default_policy && pol != current->mempolicy)
2087                 __mpol_put(pol);
2088
2089         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2090
2091         if (file) {
2092                 seq_printf(m, " file=");
2093                 seq_path(m, &file->f_path, "\n\t= ");
2094         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2095                 seq_printf(m, " heap");
2096         } else if (vma->vm_start <= mm->start_stack &&
2097                         vma->vm_end >= mm->start_stack) {
2098                 seq_printf(m, " stack");
2099         }
2100
2101         if (is_vm_hugetlb_page(vma)) {
2102                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2103                 seq_printf(m, " huge");
2104         } else {
2105                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2106                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2107         }
2108
2109         if (!md->pages)
2110                 goto out;
2111
2112         if (md->anon)
2113                 seq_printf(m," anon=%lu",md->anon);
2114
2115         if (md->dirty)
2116                 seq_printf(m," dirty=%lu",md->dirty);
2117
2118         if (md->pages != md->anon && md->pages != md->dirty)
2119                 seq_printf(m, " mapped=%lu", md->pages);
2120
2121         if (md->mapcount_max > 1)
2122                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2123
2124         if (md->swapcache)
2125                 seq_printf(m," swapcache=%lu", md->swapcache);
2126
2127         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2128                 seq_printf(m," active=%lu", md->active);
2129
2130         if (md->writeback)
2131                 seq_printf(m," writeback=%lu", md->writeback);
2132
2133         for_each_node_state(n, N_HIGH_MEMORY)
2134                 if (md->node[n])
2135                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2136 out:
2137         seq_putc(m, '\n');
2138         kfree(md);
2139
2140         if (m->count < m->size)
2141                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2142         return 0;
2143 }