* Swap reorganised 29.12.95, Stephen Tweedie
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/capability.h>
#include <linux/syscalls.h>
#include <asm/pgtable.h>
long total_swap_pages;
static int swap_overflow;
-EXPORT_SYMBOL(total_swap_pages);
-
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
struct swap_list_t swap_list = {-1, -1};
-struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct swap_info[MAX_SWAPFILES];
-static DECLARE_MUTEX(swapon_sem);
+static DEFINE_MUTEX(swapon_mutex);
/*
* We need this because the bdev->unplug_fn can sleep and we cannot
* hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a semaphore.
+ * cannot be turned into a mutex.
*/
static DECLARE_RWSEM(swap_unplug_sem);
swp_entry_t entry;
down_read(&swap_unplug_sem);
- entry.val = page->private;
+ entry.val = page_private(page);
if (PageSwapCache(page)) {
struct block_device *bdev = swap_info[swp_type(entry)].bdev;
struct backing_dev_info *bdi;
/*
* If the page is removed from swapcache from under us (with a
* racy try_to_unuse/swapoff) we need an additional reference
- * count to avoid reading garbage from page->private above. If
- * the WARN_ON triggers during a swapoff it maybe the race
+ * count to avoid reading garbage from page_private(page) above.
+ * If the WARN_ON triggers during a swapoff it maybe the race
* condition and it's harmless. However if it triggers without
* swapoff it signals a problem.
*/
last_in_cluster = offset + SWAPFILE_CLUSTER;
else if (offset == last_in_cluster) {
spin_lock(&swap_lock);
- si->cluster_next = offset-SWAPFILE_CLUSTER-1;
+ si->cluster_next = offset-SWAPFILE_CLUSTER+1;
goto cluster;
}
if (unlikely(--latency_ration < 0)) {
return (swp_entry_t) {0};
}
+swp_entry_t get_swap_page_of_type(int type)
+{
+ struct swap_info_struct *si;
+ pgoff_t offset;
+
+ spin_lock(&swap_lock);
+ si = swap_info + type;
+ if (si->flags & SWP_WRITEOK) {
+ nr_swap_pages--;
+ offset = scan_swap_map(si);
+ if (offset) {
+ spin_unlock(&swap_lock);
+ return swp_entry(type, offset);
+ }
+ nr_swap_pages++;
+ }
+ spin_unlock(&swap_lock);
+ return (swp_entry_t) {0};
+}
+
static struct swap_info_struct * swap_info_get(swp_entry_t entry)
{
struct swap_info_struct * p;
struct swap_info_struct *p;
swp_entry_t entry;
- entry.val = page->private;
+ entry.val = page_private(page);
p = swap_info_get(entry);
if (p) {
/* Subtract the 1 for the swap cache itself */
if (page_count(page) != 2) /* 2: us + cache */
return 0;
- entry.val = page->private;
+ entry.val = page_private(page);
p = swap_info_get(entry);
if (!p)
return 0;
struct swap_info_struct * p;
struct page *page = NULL;
+ if (is_migration_entry(entry))
+ return;
+
p = swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, swp_offset(entry)) == 1)
- page = find_trylock_page(&swapper_space, entry.val);
+ if (swap_entry_free(p, swp_offset(entry)) == 1) {
+ page = find_get_page(&swapper_space, entry.val);
+ if (page && unlikely(TestSetPageLocked(page))) {
+ page_cache_release(page);
+ page = NULL;
+ }
+ }
spin_unlock(&swap_lock);
}
if (page) {
int one_user;
BUG_ON(PagePrivate(page));
- page_cache_get(page);
one_user = (page_count(page) == 2);
/* Only cache user (+us), or swap space full? Free it! */
- if (!PageWriteback(page) && (one_user || vm_swap_full())) {
+ /* Also recheck PageSwapCache after page is locked (above) */
+ if (PageSwapCache(page) && !PageWriteback(page) &&
+ (one_user || vm_swap_full())) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
}
}
+#ifdef CONFIG_HIBERNATION
/*
- * Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited). We don't know just how many PTEs will
- * share this swap entry, so be cautious and let do_wp_page work out
- * what to do if a write is requested later.
+ * Find the swap type that corresponds to given device (if any).
+ *
+ * @offset - number of the PAGE_SIZE-sized block of the device, starting
+ * from 0, in which the swap header is expected to be located.
*
- * vma->vm_mm->page_table_lock is held.
+ * This is needed for the suspend to disk (aka swsusp).
+ */
+int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
+{
+ struct block_device *bdev = NULL;
+ int i;
+
+ if (device)
+ bdev = bdget(device);
+
+ spin_lock(&swap_lock);
+ for (i = 0; i < nr_swapfiles; i++) {
+ struct swap_info_struct *sis = swap_info + i;
+
+ if (!(sis->flags & SWP_WRITEOK))
+ continue;
+
+ if (!bdev) {
+ if (bdev_p)
+ *bdev_p = sis->bdev;
+
+ spin_unlock(&swap_lock);
+ return i;
+ }
+ if (bdev == sis->bdev) {
+ struct swap_extent *se;
+
+ se = list_entry(sis->extent_list.next,
+ struct swap_extent, list);
+ if (se->start_block == offset) {
+ if (bdev_p)
+ *bdev_p = sis->bdev;
+
+ spin_unlock(&swap_lock);
+ bdput(bdev);
+ return i;
+ }
+ }
+ }
+ spin_unlock(&swap_lock);
+ if (bdev)
+ bdput(bdev);
+
+ return -ENODEV;
+}
+
+/*
+ * Return either the total number of swap pages of given type, or the number
+ * of free pages of that type (depending on @free)
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+ unsigned int n = 0;
+
+ if (type < nr_swapfiles) {
+ spin_lock(&swap_lock);
+ if (swap_info[type].flags & SWP_WRITEOK) {
+ n = swap_info[type].pages;
+ if (free)
+ n -= swap_info[type].inuse_pages;
+ }
+ spin_unlock(&swap_lock);
+ }
+ return n;
+}
+#endif
+
+/*
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
*/
static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, swp_entry_t entry, struct page *page)
{
- inc_mm_counter(vma->vm_mm, rss);
+ inc_mm_counter(vma->vm_mm, anon_rss);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
unsigned long addr, unsigned long end,
swp_entry_t entry, struct page *page)
{
- pte_t *pte;
pte_t swp_pte = swp_entry_to_pte(entry);
+ pte_t *pte;
+ spinlock_t *ptl;
+ int found = 0;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
/*
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
if (unlikely(pte_same(*pte, swp_pte))) {
- unuse_pte(vma, pte, addr, entry, page);
- pte_unmap(pte);
- return 1;
+ unuse_pte(vma, pte++, addr, entry, page);
+ found = 1;
+ break;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
- return 0;
+ pte_unmap_unlock(pte - 1, ptl);
+ return found;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
down_read(&mm->mmap_sem);
lock_page(page);
}
- spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma && unuse_vma(vma, entry, page))
break;
}
- spin_unlock(&mm->page_table_lock);
up_read(&mm->mmap_sem);
/*
* Currently unuse_mm cannot fail, but leave error handling
*/
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
- page = read_swap_cache_async(entry, NULL, 0);
+ page = read_swap_cache_async(entry,
+ GFP_HIGHUSER_MOVABLE, NULL, 0);
if (!page) {
/*
* Either swap_duplicate() failed because entry
while (*swap_map > 1 && !retval &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
- if (atomic_inc_return(&mm->mm_users) == 1) {
- atomic_dec(&mm->mm_users);
+ if (!atomic_inc_not_zero(&mm->mm_users))
continue;
- }
spin_unlock(&mmlist_lock);
mmput(prev_mm);
prev_mm = mm;
/*
* So we could skip searching mms once swap count went
* to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so shrink_list will preserve it.
+ * mark page dirty so shrink_page_list will preserve it.
*/
SetPageDirty(page);
unlock_page(page);
}
}
+#ifdef CONFIG_HIBERNATION
+/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int swap_type, pgoff_t offset)
+{
+ struct swap_info_struct *sis;
+
+ if (swap_type >= nr_swapfiles)
+ return 0;
+
+ sis = swap_info + swap_type;
+ return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
+}
+#endif /* CONFIG_HIBERNATION */
+
/*
* Free all of a swapdev's extent information
*/
BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page->private };
+ swp_entry_t entry = { .val = page_private(page) };
struct swap_info_struct *sis;
sis = get_swap_info_struct(swp_type(entry));
up_write(&swap_unplug_sem);
destroy_swap_extents(p);
- down(&swapon_sem);
+ mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
drain_mmlist();
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
spin_unlock(&swap_lock);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_timeout_uninterruptible(1);
spin_lock(&swap_lock);
}
p->swap_map = NULL;
p->flags = 0;
spin_unlock(&swap_lock);
- up(&swapon_sem);
+ mutex_unlock(&swapon_mutex);
vfree(swap_map);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
set_blocksize(bdev, p->old_block_size);
bd_release(bdev);
} else {
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
inode->i_flags &= ~S_SWAPFILE;
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
}
filp_close(swap_file, NULL);
err = 0;
int i;
loff_t l = *pos;
- down(&swapon_sem);
+ mutex_lock(&swapon_mutex);
+
+ if (!l)
+ return SEQ_START_TOKEN;
for (i = 0; i < nr_swapfiles; i++, ptr++) {
if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
continue;
- if (!l--)
+ if (!--l)
return ptr;
}
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
- struct swap_info_struct *ptr = v;
+ struct swap_info_struct *ptr;
struct swap_info_struct *endptr = swap_info + nr_swapfiles;
- for (++ptr; ptr < endptr; ptr++) {
+ if (v == SEQ_START_TOKEN)
+ ptr = swap_info;
+ else {
+ ptr = v;
+ ptr++;
+ }
+
+ for (; ptr < endptr; ptr++) {
if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
continue;
++*pos;
static void swap_stop(struct seq_file *swap, void *v)
{
- up(&swapon_sem);
+ mutex_unlock(&swapon_mutex);
}
static int swap_show(struct seq_file *swap, void *v)
struct file *file;
int len;
- if (v == swap_info)
- seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ if (ptr == SEQ_START_TOKEN) {
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ return 0;
+ }
file = ptr->swap_file;
- len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+ len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
len < 40 ? 40 - len : 1, " ",
- S_ISBLK(file->f_dentry->d_inode->i_mode) ?
+ S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
"partition" : "file\t",
ptr->pages << (PAGE_SHIFT - 10),
ptr->inuse_pages << (PAGE_SHIFT - 10),
return 0;
}
-static struct seq_operations swaps_op = {
+static const struct seq_operations swaps_op = {
.start = swap_start,
.next = swap_next,
.stop = swap_stop,
return seq_open(file, &swaps_op);
}
-static struct file_operations proc_swaps_operations = {
+static const struct file_operations proc_swaps_operations = {
.open = swaps_open,
.read = seq_read,
.llseek = seq_lseek,
if (!(p->flags & SWP_USED))
break;
error = -EPERM;
- /*
- * Test if adding another swap device is possible. There are
- * two limiting factors: 1) the number of bits for the swap
- * type swp_entry_t definition and 2) the number of bits for
- * the swap type in the swap ptes as defined by the different
- * architectures. To honor both limitations a swap entry
- * with swap offset 0 and swap type ~0UL is created, encoded
- * to a swap pte, decoded to a swp_entry_t again and finally
- * the swap type part is extracted. This will mask all bits
- * from the initial ~0UL that can't be encoded in either the
- * swp_entry_t or the architecture definition of a swap pte.
- */
- if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
+ if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
goto out;
}
error = bd_claim(bdev, sys_swapon);
if (error < 0) {
bdev = NULL;
+ error = -EINVAL;
goto bad_swap;
}
p->old_block_size = block_size(bdev);
p->bdev = bdev;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
did_down = 1;
if (IS_SWAPFILE(inode)) {
error = -EBUSY;
error = -EINVAL;
goto bad_swap;
}
- page = read_cache_page(mapping, 0,
- (filler_t *)mapping->a_ops->readpage, swap_file);
+ page = read_mapping_page(mapping, 0, swap_file);
if (IS_ERR(page)) {
error = PTR_ERR(page);
goto bad_swap;
}
- wait_on_page_locked(page);
- if (!PageUptodate(page))
- goto bad_swap;
kmap(page);
swap_header = page_address(page);
else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
swap_header_version = 2;
else {
- printk("Unable to find swap-space signature\n");
+ printk(KERN_ERR "Unable to find swap-space signature\n");
error = -EINVAL;
goto bad_swap;
}
error = -EINVAL;
if (!maxpages)
goto bad_swap;
+ if (swapfilesize && maxpages > swapfilesize) {
+ printk(KERN_WARNING
+ "Swap area shorter than signature indicates\n");
+ goto bad_swap;
+ }
if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
goto bad_swap;
if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
goto bad_swap;
-
+
/* OK, set up the swap map and apply the bad block list */
if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
error = -ENOMEM;
error = 0;
memset(p->swap_map, 0, maxpages * sizeof(short));
- for (i=0; i<swap_header->info.nr_badpages; i++) {
- int page = swap_header->info.badpages[i];
- if (page <= 0 || page >= swap_header->info.last_page)
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ int page_nr = swap_header->info.badpages[i];
+ if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
error = -EINVAL;
else
- p->swap_map[page] = SWAP_MAP_BAD;
+ p->swap_map[page_nr] = SWAP_MAP_BAD;
}
nr_good_pages = swap_header->info.last_page -
swap_header->info.nr_badpages -
1 /* header page */;
- if (error)
+ if (error)
goto bad_swap;
}
- if (swapfilesize && maxpages > swapfilesize) {
- printk(KERN_WARNING
- "Swap area shorter than signature indicates\n");
- error = -EINVAL;
- goto bad_swap;
- }
if (nr_good_pages) {
p->swap_map[0] = SWAP_MAP_BAD;
p->max = maxpages;
goto bad_swap;
}
- down(&swapon_sem);
+ mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
p->flags = SWP_ACTIVE;
nr_swap_pages += nr_good_pages;
swap_info[prev].next = p - swap_info;
}
spin_unlock(&swap_lock);
- up(&swapon_sem);
+ mutex_unlock(&swapon_mutex);
error = 0;
goto out;
bad_swap:
if (did_down) {
if (!error)
inode->i_flags |= S_SWAPFILE;
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
}
return error;
}
unsigned long offset, type;
int result = 0;
+ if (is_migration_entry(entry))
+ return 1;
+
type = swp_type(entry);
if (type >= nr_swapfiles)
goto bad_file;
*/
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
- int ret = 0, i = 1 << page_cluster;
- unsigned long toff;
- struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
+ struct swap_info_struct *si;
+ int our_page_cluster = page_cluster;
+ pgoff_t target, toff;
+ pgoff_t base, end;
+ int nr_pages = 0;
- if (!page_cluster) /* no readahead */
+ if (!our_page_cluster) /* no readahead */
return 0;
- toff = (swp_offset(entry) >> page_cluster) << page_cluster;
- if (!toff) /* first page is swap header */
- toff++, i--;
- *offset = toff;
+
+ si = &swap_info[swp_type(entry)];
+ target = swp_offset(entry);
+ base = (target >> our_page_cluster) << our_page_cluster;
+ end = base + (1 << our_page_cluster);
+ if (!base) /* first page is swap header */
+ base++;
spin_lock(&swap_lock);
- do {
- /* Don't read-ahead past the end of the swap area */
- if (toff >= swapdev->max)
+ if (end > si->max) /* don't go beyond end of map */
+ end = si->max;
+
+ /* Count contiguous allocated slots above our target */
+ for (toff = target; ++toff < end; nr_pages++) {
+ /* Don't read in free or bad pages */
+ if (!si->swap_map[toff])
+ break;
+ if (si->swap_map[toff] == SWAP_MAP_BAD)
break;
+ }
+ /* Count contiguous allocated slots below our target */
+ for (toff = target; --toff >= base; nr_pages++) {
/* Don't read in free or bad pages */
- if (!swapdev->swap_map[toff])
+ if (!si->swap_map[toff])
break;
- if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
+ if (si->swap_map[toff] == SWAP_MAP_BAD)
break;
- toff++;
- ret++;
- } while (--i);
+ }
spin_unlock(&swap_lock);
- return ret;
+
+ /*
+ * Indicate starting offset, and return number of pages to get:
+ * if only 1, say 0, since there's then no readahead to be done.
+ */
+ *offset = ++toff;
+ return nr_pages? ++nr_pages: 0;
}