#include <asm/page.h>
#include <asm/pgtable.h>
-#include <asm/io.h>
+#include <asm/tlb.h>
+#include <linux/io.h>
#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
-#define for_each_hstate(h) \
- for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
+
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+ bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+ spin_unlock(&spool->lock);
+
+	/* If no pages are used and no other handles to the subpool
+	 * remain, free the subpool. */
+ if (free)
+ kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+ struct hugepage_subpool *spool;
+
+ spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ if (!spool)
+ return NULL;
+
+ spin_lock_init(&spool->lock);
+ spool->count = 1;
+ spool->max_hpages = nr_blocks;
+ spool->used_hpages = 0;
+
+ return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+ spin_lock(&spool->lock);
+ BUG_ON(!spool->count);
+ spool->count--;
+ unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ int ret = 0;
+
+ if (!spool)
+ return 0;
+
+ spin_lock(&spool->lock);
+ if ((spool->used_hpages + delta) <= spool->max_hpages) {
+ spool->used_hpages += delta;
+ } else {
+ ret = -ENOMEM;
+ }
+ spin_unlock(&spool->lock);
+
+ return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ if (!spool)
+ return;
+
+ spin_lock(&spool->lock);
+ spool->used_hpages -= delta;
+	/* If hugetlbfs_put_super couldn't free the subpool because pages
+	 * were still in use, free it now. */
+ unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+ return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+ return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
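
/*
 * A minimal usage sketch of the subpool helpers above, showing how they
 * are intended to pair up over the life of a hugetlbfs mount. The
 * example_* functions are hypothetical; only the hugepage_new_subpool,
 * hugepage_subpool_get_pages/put_pages and hugepage_put_subpool calls
 * come from this file.
 */
static struct hugepage_subpool *example_mount(long max_hpages)
{
	/* one subpool per mount, created holding a single reference */
	return hugepage_new_subpool(max_hpages);
}

static int example_charge_page(struct hugepage_subpool *spool)
{
	/* charge one huge page against the per-mount limit before allocating */
	if (hugepage_subpool_get_pages(spool, 1))
		return -ENOSPC;
	return 0;
}

static void example_uncharge_page(struct hugepage_subpool *spool)
{
	/* return the page's worth of usage when the huge page is freed */
	hugepage_subpool_put_pages(spool, 1);
}

static void example_unmount(struct hugepage_subpool *spool)
{
	/* drop the mount's reference; kfree'd once used_hpages reaches zero */
	hugepage_put_subpool(spool);
}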
/*
* Region tracking -- allows tracking of reservations and instantiated pages
* must either hold the mmap_sem for write, or the mmap_sem for read and
* the hugetlb_instantiation mutex:
*
- * down_write(&mm->mmap_sem);
+ * down_write(&mm->mmap_sem);
* or
- * down_read(&mm->mmap_sem);
- * mutex_lock(&hugetlb_instantiation_mutex);
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
*/
struct file_region {
struct list_head link;
/* Locate each segment we overlap with, and count that overlap. */
list_for_each_entry(rg, head, link) {
- int seg_from;
- int seg_to;
+ long seg_from;
+ long seg_to;
if (rg->to <= f)
continue;
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
- list_add(&page->lru, &h->hugepage_freelists[nid]);
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
}
if (list_empty(&h->hugepage_freelists[nid]))
return NULL;
page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
- list_del(&page->lru);
+ list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ unsigned int cpuset_mems_cookie;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol, &nodemask);
/*
}
}
}
-err:
+
mpol_cond_put(mpol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
+
+err:
+ mpol_cond_put(mpol);
+ return NULL;
}
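
/*
 * A minimal sketch of the cpuset retry idiom used above, assuming the
 * cookie-returning get_mems_allowed()/put_mems_allowed() API of this
 * kernel series. example_alloc_from_zonelist() stands in for the
 * zonelist walk and is hypothetical.
 */
static struct page *example_alloc_from_zonelist(void);

static struct page *example_cpuset_safe_alloc(void)
{
	struct page *page;
	unsigned int cpuset_mems_cookie;

retry_cpuset:
	/* snapshot the allowed node mask before walking the zonelist */
	cpuset_mems_cookie = get_mems_allowed();
	page = example_alloc_from_zonelist();
	/*
	 * put_mems_allowed() returns false if mems_allowed changed while
	 * we were allocating; a failed allocation may then be spurious,
	 * so retry with the new node mask.
	 */
	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
		goto retry_cpuset;
	return page;
}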
static void update_and_free_page(struct hstate *h, struct page *page)
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
- 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1<< PG_writeback);
+ page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1 << PG_writeback);
}
+ VM_BUG_ON(hugetlb_cgroup_from_page(page));
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
arch_release_hugepage(page);
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
- struct address_space *mapping;
+ struct hugepage_subpool *spool =
+ (struct hugepage_subpool *)page_private(page);
- mapping = (struct address_space *) page_private(page);
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
BUG_ON(page_mapcount(page));
- INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ /* remove the page from active list */
+ list_del(&page->lru);
update_and_free_page(h, page);
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
- if (mapping)
- hugetlb_put_quota(mapping, 1);
+ hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
+ INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
+ set_hugetlb_cgroup(page, NULL);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
__SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
__SetPageTail(p);
+ set_page_count(p, 0);
p->first_page = page;
}
}
return dtor == free_huge_page;
}
-
EXPORT_SYMBOL_GPL(PageHuge);
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
if (page && arch_prepare_hugepage(page)) {
__free_pages(page, huge_page_order(h));
- return NULL;
+ page = NULL;
}
spin_lock(&hugetlb_lock);
if (page) {
+ INIT_LIST_HEAD(&page->lru);
r_nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
+ set_hugetlb_cgroup(page, NULL);
/*
* We incremented the global counters already
*/
struct page *page, *tmp;
int ret, i;
int needed, allocated;
+ bool alloc_ok = true;
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
- if (!page)
- /*
- * We were not able to allocate enough pages to
- * satisfy the entire reservation so we free what
- * we've allocated so far.
- */
- goto free;
-
+ if (!page) {
+ alloc_ok = false;
+ break;
+ }
list_add(&page->lru, &surplus_list);
}
- allocated += needed;
+ allocated += i;
/*
* After retaking hugetlb_lock, we need to recalculate 'needed'
spin_lock(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
- if (needed > 0)
- goto retry;
-
+ if (needed > 0) {
+ if (alloc_ok)
+ goto retry;
+ /*
+ * We were not able to allocate enough pages to
+ * satisfy the entire reservation so we free what
+ * we've allocated so far.
+ */
+ goto free;
+ }
/*
* The surplus_list now contains _at_least_ the number of extra pages
* needed to accommodate the reservation. Add the appropriate number
h->resv_huge_pages += delta;
ret = 0;
- spin_unlock(&hugetlb_lock);
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
- list_del(&page->lru);
/*
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
+free:
+ spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
-free:
if (!list_empty(&surplus_list)) {
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- list_del(&page->lru);
put_page(page);
}
}
/*
* Determine if the huge page at addr within the vma has an associated
* reservation. Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed. Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur. Where any new reservation would be required the
+ * reservation change is prepared, but not committed. Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation. No action is required on
+ * failure.
*/
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
static struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
+ struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
long chg;
+ int ret, idx;
+ struct hugetlb_cgroup *h_cg;
+ idx = hstate_index(h);
/*
- * Processes that did not create the mapping will have no reserves and
- * will not have accounted against quota. Check that the quota can be
- * made before satisfying the allocation
- * MAP_NORESERVE mappings may also need pages and quota allocated
- * if no reserve mapping overlaps.
+	 * Processes that did not create the mapping will have no
+	 * reserves and will not have accounted against the subpool
+	 * limit. Check that the subpool limit can be made before
+	 * satisfying the allocation. MAP_NORESERVE mappings may also
+	 * need pages and subpool limit allocated if no reserve
+	 * mapping overlaps.
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
- return ERR_PTR(chg);
+ return ERR_PTR(-ENOMEM);
if (chg)
- if (hugetlb_get_quota(inode->i_mapping, chg))
+ if (hugepage_subpool_get_pages(spool, chg))
return ERR_PTR(-ENOSPC);
+ ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+ if (ret) {
+ hugepage_subpool_put_pages(spool, chg);
+ return ERR_PTR(-ENOSPC);
+ }
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
- spin_unlock(&hugetlb_lock);
-
- if (!page) {
+ if (page) {
+ /* update page cgroup details */
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ spin_unlock(&hugetlb_lock);
+ } else {
+ spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
- hugetlb_put_quota(inode->i_mapping, chg);
- return ERR_PTR(-VM_FAULT_SIGBUS);
+ hugetlb_cgroup_uncharge_cgroup(idx,
+ pages_per_huge_page(h),
+ h_cg);
+ hugepage_subpool_put_pages(spool, chg);
+ return ERR_PTR(-ENOSPC);
}
+ spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ list_move(&page->lru, &h->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
}
- set_page_private(page, (unsigned long) mapping);
+ set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
-
return page;
}
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
- struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+ struct page *page;
+
+#ifdef CONFIG_HIGHMEM
+ page = pfn_to_page(m->phys >> PAGE_SHIFT);
+ free_bootmem_late((unsigned long)m,
+ sizeof(struct huge_bootmem_page));
+#else
+ page = virt_to_page(m);
+#endif
__ClearPageReserved(page);
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
prep_new_huge_page(h, page, page_to_nid(page));
+ /*
+ * If we had gigantic hugepages allocated at boot time, we need
+ * to restore the 'stolen' pages to totalram_pages in order to
+	 * fix confusing memory reports from free(1) and other
+	 * side effects, such as CommitLimit going negative.
+ */
+ if (h->order > (MAX_ORDER - 1))
+ totalram_pages += 1 << h->order;
}
}
struct attribute_group *hstate_attr_group)
{
int retval;
- int hi = h - hstates;
+ int hi = hstate_index(h);
hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
if (!hstate_kobjs[hi])
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
- * with node sysdevs in node_devices[] using a parallel array. The array
- * index of a node sysdev or _hstate == node id.
- * This is here to avoid any static dependency of the node sysdev driver, in
+ * with node devices in node_devices[] using a parallel array. The array
+ * index of a node device or _hstate == node id.
+ * This is here to avoid any static dependency of the node device driver, in
* the base kernel, on the hugetlb module.
*/
struct node_hstate {
struct node_hstate node_hstates[MAX_NUMNODES];
/*
- * A subset of global hstate attributes for node sysdevs
+ * A subset of global hstate attributes for node devices
*/
static struct attribute *per_node_hstate_attrs[] = {
&nr_hugepages_attr.attr,
};
/*
- * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
+ * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
* Returns node id via non-NULL nidp.
*/
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
}
/*
- * Unregister hstate attributes from a single node sysdev.
+ * Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
- for_each_hstate(h)
- if (nhs->hstate_kobjs[h - hstates]) {
- kobject_put(nhs->hstate_kobjs[h - hstates]);
- nhs->hstate_kobjs[h - hstates] = NULL;
+ for_each_hstate(h) {
+ int idx = hstate_index(h);
+ if (nhs->hstate_kobjs[idx]) {
+ kobject_put(nhs->hstate_kobjs[idx]);
+ nhs->hstate_kobjs[idx] = NULL;
}
+ }
kobject_put(nhs->hugepages_kobj);
nhs->hugepages_kobj = NULL;
}
/*
- * hugetlb module exit: unregister hstate attributes from node sysdevs
+ * hugetlb module exit: unregister hstate attributes from node devices
* that have them.
*/
static void hugetlb_unregister_all_nodes(void)
int nid;
/*
- * disable node sysdev registrations.
+ * disable node device registrations.
*/
register_hugetlbfs_with_node(NULL, NULL);
}
/*
- * Register hstate attributes for a single node sysdev.
+ * Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
void hugetlb_register_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
if (nhs->hugepages_kobj)
return; /* already allocated */
nhs->hugepages_kobj = kobject_create_and_add("hugepages",
- &node->sysdev.kobj);
+ &node->dev.kobj);
if (!nhs->hugepages_kobj)
return;
if (err) {
printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
" for node %d\n",
- h->name, node->sysdev.id);
+ h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
}
/*
* hugetlb init time: register hstate attributes for all registered node
- * sysdevs of nodes that have memory. All on-line nodes should have
- * registered their associated sysdev by this time.
+ * devices of nodes that have memory. All on-line nodes should have
+ * registered their associated device by this time.
*/
static void hugetlb_register_all_nodes(void)
{
for_each_node_state(nid, N_HIGH_MEMORY) {
struct node *node = &node_devices[nid];
- if (node->sysdev.id == nid)
+ if (node->dev.id == nid)
hugetlb_register_node(node);
}
/*
- * Let the node sysdev driver know we're here so it can
+ * Let the node device driver know we're here so it can
* [un]register hstate attributes on node hotplug.
*/
register_hugetlbfs_with_node(hugetlb_register_node,
hugetlb_unregister_all_nodes();
for_each_hstate(h) {
- kobject_put(hstate_kobjs[h - hstates]);
+ kobject_put(hstate_kobjs[hstate_index(h)]);
}
kobject_put(hugepages_kobj);
if (!size_to_hstate(default_hstate_size))
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
}
- default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+ default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
if (default_hstate_max_huge_pages)
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
return;
}
- BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
- h = &hstates[max_hstate++];
+ h = &hstates[hugetlb_max_hstate++];
h->order = order;
h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
h->nr_huge_pages = 0;
h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+ INIT_LIST_HEAD(&h->hugepage_activelist);
h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
+ /*
+ * Add cgroup control files only if the huge page consists
+ * of more than two normal pages. This is because we use
+	 * page[2].lru.next for storing cgroup details.
+ */
+ if (order >= HUGETLB_CGROUP_MIN_ORDER)
+ hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
parsed_hstate = h;
}
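
/*
 * Rough shape of the hugetlb_cgroup accessors the comment above refers
 * to: the hugetlb_cgroup pointer is stashed in page[2].lru.next, which is
 * why cgroup control files are only added for orders of at least
 * HUGETLB_CGROUP_MIN_ORDER. A sketch modelled on <linux/hugetlb_cgroup.h>,
 * not the verbatim header; the example_* names are hypothetical.
 */
static inline struct hugetlb_cgroup *
example_hugetlb_cgroup_from_page(struct page *page)
{
	if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
		return NULL;
	return (struct hugetlb_cgroup *)page[2].lru.next;
}

static inline void example_set_hugetlb_cgroup(struct page *page,
					      struct hugetlb_cgroup *h_cg)
{
	if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
		return;
	page[2].lru.next = (void *)h_cg;
}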
static unsigned long *last_mhp;
/*
- * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
* so this hugepages= parameter goes to the "default hstate".
*/
- if (!max_hstate)
+ if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
* But we need to allocate >= MAX_ORDER hstates here early to still
* use the bootmem allocator.
*/
- if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+ if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
kref_get(&reservations->refs);
}
+static void resv_map_put(struct vm_area_struct *vma)
+{
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ if (!reservations)
+ return;
+ kref_put(&reservations->refs, resv_map_release);
+}
+
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
struct resv_map *reservations = vma_resv_map(vma);
+ struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve;
unsigned long start;
unsigned long end;
reserve = (end - start) -
region_count(&reservations->regions, start, end);
- kref_put(&reservations->refs, resv_map_release);
+ resv_map_put(vma);
if (reserve) {
hugetlb_acct_memory(h, -reserve);
- hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+ hugepage_subpool_put_pages(spool, reserve);
}
}
}
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
+ entry = arch_make_huge_pte(entry, vma, page, writable);
return entry;
}
pte_t entry;
entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
- if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
- }
}
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp)) {
+ if (non_swap_entry(swp) && is_migration_entry(swp))
return 1;
- } else
+ else
return 0;
}
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
return 1;
- } else
+ else
return 0;
}
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
{
+ int force_flush = 0;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
struct page *page;
- struct page *tmp;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- /*
- * A page gathering list, protected by per file i_mmap_lock. The
- * lock is used to avoid list corruption from multiple unmapping
- * of the same page since we are using page->lru.
- */
- LIST_HEAD(page_list);
-
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
+ tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, start, end);
+again:
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
if (huge_pmd_unshare(mm, &address, ptep))
continue;
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte))
+ continue;
+
+ /*
+	 * A HWPoisoned hugepage is already unmapped and its reference dropped.
+ */
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+ continue;
+
+ page = pte_page(pte);
/*
* If a reference page is supplied, it is because a specific
* page is being unmapped, not a range. Ensure the page we
* are about to unmap is the actual page of interest.
*/
if (ref_page) {
- pte = huge_ptep_get(ptep);
- if (huge_pte_none(pte))
- continue;
- page = pte_page(pte);
if (page != ref_page)
continue;
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
- if (huge_pte_none(pte))
- continue;
-
- /*
- * HWPoisoned hugepage is already unmapped and dropped reference
- */
- if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
- continue;
-
- page = pte_page(pte);
+ tlb_remove_tlb_entry(tlb, ptep, address);
if (pte_dirty(pte))
set_page_dirty(page);
- list_add(&page->lru, &page_list);
+
+ page_remove_rmap(page);
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush)
+ break;
+ /* Bail out after unmapping reference page if supplied */
+ if (ref_page)
+ break;
}
spin_unlock(&mm->page_table_lock);
- flush_tlb_range(vma, start, end);
- mmu_notifier_invalidate_range_end(mm, start, end);
- list_for_each_entry_safe(page, tmp, &page_list, lru) {
- page_remove_rmap(page);
- list_del(&page->lru);
- put_page(page);
+ /*
+	 * mmu_gather ran out of room to batch pages; we break out of
+	 * the PTE lock to avoid doing the potentially expensive TLB invalidate
+ * and page-free while holding it.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (address < end && !ref_page)
+ goto again;
}
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_end_vma(tlb, vma);
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
- __unmap_hugepage_range(vma, start, end, ref_page);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ struct mm_struct *mm;
+ struct mmu_gather tlb;
+
+ mm = vma->vm_mm;
+
+ tlb_gather_mmu(&tlb, mm, 0);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
* from page cache lookup which is in HPAGE_SIZE units.
*/
address = address & huge_page_mask(h);
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
- + (vma->vm_pgoff >> PAGE_SHIFT);
- mapping = (struct address_space *)page_private(page);
+ pgoff = vma_hugecache_offset(h, vma, address);
+ mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
* this mapping should be shared between all the VMAs,
* __unmap_hugepage_range() is called as the lock is already held
*/
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
* from the time of fork. This would look like data corruption
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
- __unmap_hugepage_range(iter_vma,
- address, address + huge_page_size(h),
- page);
+ unmap_hugepage_range(iter_vma, address,
+ address + huge_page_size(h), page);
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return 1;
}
/*
* Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * cannot race with other handlers or page migration.
+ * Keep the pte_same checks anyway to make the transition away from the mutex easier.
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
+ long err = PTR_ERR(new_page);
page_cache_release(old_page);
/*
if (outside_reserve) {
BUG_ON(huge_pte_none(pte));
if (unmap_ref_private(mm, vma, old_page, address)) {
- BUG_ON(page_count(old_page) != 1);
BUG_ON(huge_pte_none(pte));
spin_lock(&mm->page_table_lock);
- goto retry_avoidcopy;
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(pte_same(huge_ptep_get(ptep), pte)))
+ goto retry_avoidcopy;
+ /*
+	 * A race occurred while re-acquiring the page table lock, and
+ * our job is done.
+ */
+ return 0;
}
WARN_ON_ONCE(1);
}
/* Caller expects lock to be held */
spin_lock(&mm->page_table_lock);
- return -PTR_ERR(new_page);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ else
+ return VM_FAULT_SIGBUS;
}
/*
* anon_vma prepared.
*/
if (unlikely(anon_vma_prepare(vma))) {
+ page_cache_release(new_page);
+ page_cache_release(old_page);
/* Caller expects lock to be held */
spin_lock(&mm->page_table_lock);
return VM_FAULT_OOM;
{
struct hstate *h = hstate_vma(vma);
int ret = VM_FAULT_SIGBUS;
+ int anon_rmap = 0;
pgoff_t idx;
unsigned long size;
struct page *page;
goto out;
page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
- ret = -PTR_ERR(page);
+ ret = PTR_ERR(page);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
- page_dup_rmap(page);
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
- hugepage_add_new_anon_rmap(page, vma, address);
+ anon_rmap = 1;
}
} else {
/*
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON |
- VM_FAULT_SET_HINDEX(h - hstates);
+ ret = VM_FAULT_HWPOISON |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
- page_dup_rmap(page);
}
/*
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
+ if (anon_rmap)
+ hugepage_add_new_anon_rmap(page, vma, address);
+ else
+ page_dup_rmap(page);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ address &= huge_page_mask(h);
+
ptep = huge_pte_offset(mm, address);
if (ptep) {
entry = huge_ptep_get(ptep);
migration_entry_wait(mm, (pmd_t *)ptep, address);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(h - hstates);
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
}
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
* so no worry about deadlock.
*/
page = pte_page(entry);
+ get_page(page);
if (page != pagecache_page)
lock_page(page);
}
if (page != pagecache_page)
unlock_page(page);
+ put_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
spin_lock(&mm->page_table_lock);
for (; address < end; address += huge_page_size(h)) {
ptep = huge_pte_offset(mm, address);
}
}
spin_unlock(&mm->page_table_lock);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
flush_tlb_range(vma, start, end);
}
int hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
- int acctflag)
+ vm_flags_t vm_flags)
{
long ret, chg;
struct hstate *h = hstate_inode(inode);
+ struct hugepage_subpool *spool = subpool_inode(inode);
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
- * and filesystem quota without using reserves
+ * without using reserves
*/
- if (acctflag & VM_NORESERVE)
+ if (vm_flags & VM_NORESERVE)
return 0;
/*
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
- if (chg < 0)
- return chg;
+ if (chg < 0) {
+ ret = chg;
+ goto out_err;
+ }
- /* There must be enough filesystem quota for the mapping */
- if (hugetlb_get_quota(inode->i_mapping, chg))
- return -ENOSPC;
+ /* There must be enough pages in the subpool for the mapping */
+ if (hugepage_subpool_get_pages(spool, chg)) {
+ ret = -ENOSPC;
+ goto out_err;
+ }
/*
* Check enough hugepages are available for the reservation.
- * Hand back the quota if there are not
+ * Hand the pages back to the subpool if there are not
*/
ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
- hugetlb_put_quota(inode->i_mapping, chg);
- return ret;
+ hugepage_subpool_put_pages(spool, chg);
+ goto out_err;
}
/*
if (!vma || vma->vm_flags & VM_MAYSHARE)
region_add(&inode->i_mapping->private_list, from, to);
return 0;
+out_err:
+ if (vma)
+ resv_map_put(vma);
+ return ret;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
struct hstate *h = hstate_inode(inode);
long chg = region_truncate(&inode->i_mapping->private_list, offset);
+ struct hugepage_subpool *spool = subpool_inode(inode);
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
- hugetlb_put_quota(inode->i_mapping, (chg - freed));
+ hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -(chg - freed));
}