hugetlbfs: per mount huge page sizes
Andi Kleen [Thu, 24 Jul 2008 04:27:43 +0000 (21:27 -0700)]
Add the ability to configure the hugetlb hstate used on a per mount basis.

- Add a new pagesize= option to the hugetlbfs mount that allows setting
  the page size
- This option causes the mount code to find the hstate corresponding to the
  specified size, and sets up a pointer to the hstate in the mount's
  superblock.
- Change the hstate accessors to use this information rather than the
  global_hstate they were using (requires a slight change in mm/memory.c
  so we don't NULL deref in the error-unmap path -- see comments).

[np: take hstate out of hugetlbfs inode and vma->vm_private_data]

Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

fs/hugetlbfs/inode.c
include/linux/hugetlb.h
mm/hugetlb.c
mm/memory.c

index 516c581..dbd01d2 100644 (file)
@@ -53,6 +53,7 @@ int sysctl_hugetlb_shm_group;
 enum {
        Opt_size, Opt_nr_inodes,
        Opt_mode, Opt_uid, Opt_gid,
+       Opt_pagesize,
        Opt_err,
 };
 
@@ -62,6 +63,7 @@ static match_table_t tokens = {
        {Opt_mode,      "mode=%o"},
        {Opt_uid,       "uid=%u"},
        {Opt_gid,       "gid=%u"},
+       {Opt_pagesize,  "pagesize=%s"},
        {Opt_err,       NULL},
 };
 
@@ -750,6 +752,8 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
        char *p, *rest;
        substring_t args[MAX_OPT_ARGS];
        int option;
+       unsigned long long size = 0;
+       enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
 
        if (!options)
                return 0;
@@ -780,17 +784,13 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
                        break;
 
                case Opt_size: {
-                       unsigned long long size;
                        /* memparse() will accept a K/M/G without a digit */
                        if (!isdigit(*args[0].from))
                                goto bad_val;
                        size = memparse(args[0].from, &rest);
-                       if (*rest == '%') {
-                               size <<= HPAGE_SHIFT;
-                               size *= max_huge_pages;
-                               do_div(size, 100);
-                       }
-                       pconfig->nr_blocks = (size >> HPAGE_SHIFT);
+                       setsize = SIZE_STD;
+                       if (*rest == '%')
+                               setsize = SIZE_PERCENT;
                        break;
                }
 
@@ -801,6 +801,19 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
                        pconfig->nr_inodes = memparse(args[0].from, &rest);
                        break;
 
+               case Opt_pagesize: {
+                       unsigned long ps;
+                       ps = memparse(args[0].from, &rest);
+                       pconfig->hstate = size_to_hstate(ps);
+                       if (!pconfig->hstate) {
+                               printk(KERN_ERR
+                               "hugetlbfs: Unsupported page size %lu MB\n",
+                                       ps >> 20);
+                               return -EINVAL;
+                       }
+                       break;
+               }
+
                default:
                        printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
                                 p);
@@ -808,6 +821,18 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
                        break;
                }
        }
+
+       /* Do size after hstate is set up */
+       if (setsize > NO_SIZE) {
+               struct hstate *h = pconfig->hstate;
+               if (setsize == SIZE_PERCENT) {
+                       size <<= huge_page_shift(h);
+                       size *= h->max_huge_pages;
+                       do_div(size, 100);
+               }
+               pconfig->nr_blocks = (size >> huge_page_shift(h));
+       }
+
        return 0;
 
 bad_val:
@@ -832,6 +857,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
        config.uid = current->fsuid;
        config.gid = current->fsgid;
        config.mode = 0755;
+       config.hstate = &default_hstate;
        ret = hugetlbfs_parse_options(data, &config);
        if (ret)
                return ret;
@@ -840,14 +866,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
        if (!sbinfo)
                return -ENOMEM;
        sb->s_fs_info = sbinfo;
+       sbinfo->hstate = config.hstate;
        spin_lock_init(&sbinfo->stat_lock);
        sbinfo->max_blocks = config.nr_blocks;
        sbinfo->free_blocks = config.nr_blocks;
        sbinfo->max_inodes = config.nr_inodes;
        sbinfo->free_inodes = config.nr_inodes;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
-       sb->s_blocksize = HPAGE_SIZE;
-       sb->s_blocksize_bits = HPAGE_SHIFT;
+       sb->s_blocksize = huge_page_size(config.hstate);
+       sb->s_blocksize_bits = huge_page_shift(config.hstate);
        sb->s_magic = HUGETLBFS_MAGIC;
        sb->s_op = &hugetlbfs_ops;
        sb->s_time_gran = 1;
index b75bdb4..ba9263e 100644 (file)
@@ -100,6 +100,7 @@ struct hugetlbfs_config {
        umode_t mode;
        long    nr_blocks;
        long    nr_inodes;
+       struct hstate *hstate;
 };
 
 struct hugetlbfs_sb_info {
@@ -108,6 +109,7 @@ struct hugetlbfs_sb_info {
        long    max_inodes;   /* inodes allowed */
        long    free_inodes;  /* inodes free */
        spinlock_t      stat_lock;
+       struct hstate *hstate;
 };
 
 
@@ -191,19 +193,21 @@ extern unsigned int default_hstate_idx;
 
 #define default_hstate (hstates[default_hstate_idx])
 
-static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
+static inline struct hstate *hstate_inode(struct inode *i)
 {
-       return &default_hstate;
+       struct hugetlbfs_sb_info *hsb;
+       hsb = HUGETLBFS_SB(i->i_sb);
+       return hsb->hstate;
 }
 
 static inline struct hstate *hstate_file(struct file *f)
 {
-       return &default_hstate;
+       return hstate_inode(f->f_dentry->d_inode);
 }
 
-static inline struct hstate *hstate_inode(struct inode *i)
+static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
 {
-       return &default_hstate;
+       return hstate_file(vma->vm_file);
 }
 
 static inline unsigned long huge_page_size(struct hstate *h)
index 82378d4..4cf7a90 100644 (file)
@@ -1439,19 +1439,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end, struct page *ref_page)
 {
-       /*
-        * It is undesirable to test vma->vm_file as it should be non-null
-        * for valid hugetlb area. However, vm_file will be NULL in the error
-        * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
-        * do_mmap_pgoff() nullifies vma->vm_file before calling this function
-        * to clean up. Since no pte has actually been setup, it is safe to
-        * do nothing in this case.
-        */
-       if (vma->vm_file) {
-               spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
-               __unmap_hugepage_range(vma, start, end, ref_page);
-               spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
-       }
+       spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+       __unmap_hugepage_range(vma, start, end, ref_page);
+       spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 }
 
 /*
index c1c1d6d..02fc6b1 100644 (file)
@@ -901,9 +901,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                        }
 
                        if (unlikely(is_vm_hugetlb_page(vma))) {
-                               unmap_hugepage_range(vma, start, end, NULL);
-                               zap_work -= (end - start) /
+                               /*
+                                * It is undesirable to test vma->vm_file as it
+                                * should be non-null for valid hugetlb area.
+                                * However, vm_file will be NULL in the error
+                                * cleanup path of do_mmap_pgoff. When
+                                * hugetlbfs ->mmap method fails,
+                                * do_mmap_pgoff() nullifies vma->vm_file
+                                * before calling this function to clean up.
+                                * Since no pte has actually been setup, it is
+                                * safe to do nothing in this case.
+                                */
+                               if (vma->vm_file) {
+                                       unmap_hugepage_range(vma, start, end, NULL);
+                                       zap_work -= (end - start) /
                                        pages_per_huge_page(hstate_vma(vma));
+                               }
+
                                start = end;
                        } else
                                start = unmap_page_range(*tlbp, vma,