thp: madvise(MADV_HUGEPAGE)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 9989083..027100d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 struct backing_dev_info noop_backing_dev_info = {
        .name           = "noop",
+       .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 
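The new .capabilities value marks the noop BDI as doing neither writeback
nor dirty accounting. Assuming the definition in include/linux/backing-dev.h
of this era, the flag is shorthand for the three NO_* capabilities:

	#define BDI_CAP_NO_ACCT_AND_WRITEBACK \
		(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)

so helpers such as bdi_cap_writeback_dirty() and bdi_cap_account_dirty()
return false for noop_backing_dev_info and the writeback paths skip it.
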
@@ -73,15 +74,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 
        nr_wb = nr_dirty = nr_io = nr_more_io = 0;
        spin_lock(&inode_lock);
-       list_for_each_entry(inode, &wb->b_dirty, i_list)
+       list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                nr_dirty++;
-       list_for_each_entry(inode, &wb->b_io, i_list)
+       list_for_each_entry(inode, &wb->b_io, i_wb_list)
                nr_io++;
-       list_for_each_entry(inode, &wb->b_more_io, i_list)
+       list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                nr_more_io++;
        spin_unlock(&inode_lock);
 
-       get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+       global_dirty_limits(&background_thresh, &dirty_thresh);
+       bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
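The debug code now uses the split API from mm/page-writeback.c in place of
the old get_dirty_limits(). The declarations, as assumed here:

	void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
	unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
				      unsigned long dirty);

Callers that only need the global background/dirty thresholds no longer pay
for the per-bdi calculation; bdi_dirty_limit() scales the global dirty
threshold by this bdi's share of recently completed writeback.
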
@@ -242,6 +244,7 @@ static int __init default_bdi_init(void)
        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
+       err = bdi_init(&noop_backing_dev_info);
 
        return err;
 }
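Note that the returned err now reflects only the second bdi_init(); a
failure of the first call (which also skips registration) is masked if the
noop init succeeds. A hypothetical variant that preserves the first error,
not part of this patch:

	err = bdi_init(&default_backing_dev_info);
	if (!err)
		bdi_register(&default_backing_dev_info, NULL, "default");
	if (!err)
		err = bdi_init(&noop_backing_dev_info);

	return err;
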
@@ -310,6 +313,7 @@ static void wakeup_timer_fn(unsigned long data)
 
        spin_lock_bh(&bdi->wb_lock);
        if (bdi->wb.task) {
+               trace_writeback_wake_thread(bdi);
                wake_up_process(bdi->wb.task);
        } else {
                /*
@@ -317,6 +321,7 @@ static void wakeup_timer_fn(unsigned long data)
                 * In this case we have to wake-up the forker thread which
                 * should create and run the bdi thread.
                 */
+               trace_writeback_wake_forker_thread(bdi);
                wake_up_process(default_backing_dev_info.wb.task);
        }
        spin_unlock_bh(&bdi->wb_lock);
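The two tracepoints added above are assumed to be built on the existing
writeback_class event class in include/trace/events/writeback.h, which
records the bdi's name; a sketch of the matching definitions:

	DEFINE_EVENT(writeback_class, writeback_wake_thread,
		TP_PROTO(struct backing_dev_info *bdi),
		TP_ARGS(bdi)
	);

	DEFINE_EVENT(writeback_class, writeback_wake_forker_thread,
		TP_PROTO(struct backing_dev_info *bdi),
		TP_ARGS(bdi)
	);
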
@@ -357,7 +362,7 @@ static int bdi_forker_thread(void *ptr)
 {
        struct bdi_writeback *me = ptr;
 
-       current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+       current->flags |= PF_SWAPWRITE;
        set_freezable();
 
        /*
@@ -413,7 +418,8 @@ static int bdi_forker_thread(void *ptr)
                                break;
                        }
 
-                       spin_lock_bh(&bdi->wb_lock);
+                       spin_lock(&bdi->wb_lock);
+
                        /*
                         * If there is no work to do and the bdi thread was
                         * inactive long enough - kill it. The wb_lock is taken
@@ -430,7 +436,7 @@ static int bdi_forker_thread(void *ptr)
                                action = KILL_THREAD;
                                break;
                        }
-                       spin_unlock_bh(&bdi->wb_lock);
+                       spin_unlock(&bdi->wb_lock);
                }
                spin_unlock_bh(&bdi_lock);
 
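These two hunks can drop the _bh variant because the nested wb_lock is
taken entirely inside the spin_lock_bh(&bdi_lock) section (note the unlock
just above), where softirqs are already disabled on this CPU. The nesting
pattern, in sketch form:

	spin_lock_bh(&bdi_lock);	/* disables softirqs */
	spin_lock(&bdi->wb_lock);	/* plain lock suffices here */
	/* ... */
	spin_unlock(&bdi->wb_lock);
	spin_unlock_bh(&bdi_lock);	/* re-enables softirqs */
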
@@ -441,8 +447,8 @@ static int bdi_forker_thread(void *ptr)
                switch (action) {
                case FORK_THREAD:
                        __set_current_state(TASK_RUNNING);
-                       task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
-                                          dev_name(bdi->dev));
+                       task = kthread_create(bdi_writeback_thread, &bdi->wb,
+                                             "flush-%s", dev_name(bdi->dev));
                        if (IS_ERR(task)) {
                                /*
                                 * If thread creation fails, force writeout of
@@ -453,10 +459,13 @@ static int bdi_forker_thread(void *ptr)
                                /*
                                 * The spinlock makes sure we do not lose
                                 * wake-ups when racing with 'bdi_queue_work()'.
+                                * And as soon as the bdi thread is visible, we
+                                * can start it.
                                 */
                                spin_lock_bh(&bdi->wb_lock);
                                bdi->wb.task = task;
                                spin_unlock_bh(&bdi->wb_lock);
+                               wake_up_process(task);
                        }
                        break;
 
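Switching from kthread_run() to kthread_create() closes the window in which
the new thread could start running before bdi->wb.task was assigned, and in
which a wake-up racing in from bdi_queue_work() could be lost. The
resulting create/publish/wake pattern, in sketch form:

	task = kthread_create(bdi_writeback_thread, &bdi->wb,
			      "flush-%s", dev_name(bdi->dev));
	if (!IS_ERR(task)) {
		spin_lock_bh(&bdi->wb_lock);
		bdi->wb.task = task;		/* publish first */
		spin_unlock_bh(&bdi->wb_lock);
		wake_up_process(task);		/* then let it run */
	}
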
@@ -509,23 +518,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...)
 {
        va_list args;
-       int ret = 0;
        struct device *dev;
 
        if (bdi->dev)   /* The driver needs to use separate queues per device */
-               goto exit;
+               return 0;
 
        va_start(args, fmt);
        dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
        va_end(args);
-       if (IS_ERR(dev)) {
-               ret = PTR_ERR(dev);
-               goto exit;
-       }
-
-       spin_lock_bh(&bdi_lock);
-       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
-       spin_unlock_bh(&bdi_lock);
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
 
        bdi->dev = dev;
 
@@ -539,20 +541,19 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
                wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
                                                dev_name(dev));
-               if (IS_ERR(wb->task)) {
-                       wb->task = NULL;
-                       ret = -ENOMEM;
-
-                       bdi_remove_from_list(bdi);
-                       goto exit;
-               }
+               if (IS_ERR(wb->task))
+                       return PTR_ERR(wb->task);
        }
 
        bdi_debug_register(bdi, dev_name(dev));
        set_bit(BDI_registered, &bdi->state);
+
+       spin_lock_bh(&bdi_lock);
+       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+       spin_unlock_bh(&bdi_lock);
+
        trace_writeback_bdi_register(bdi);
-exit:
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL(bdi_register);
 
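With this reordering a bdi appears on bdi_list only after its device,
forker thread, and debugfs entries exist, so list walkers never observe a
half-initialized bdi; it also lets the error paths simply return without
the old bdi_remove_from_list() rollback. The resulting shape, in sketch
form:

	dev = device_create_vargs(...);		/* 1. create the device */
	bdi->dev = dev;
	/* 2. start the forker thread, register debugfs entries */
	set_bit(BDI_registered, &bdi->state);	/* 3. mark it live */
	spin_lock_bh(&bdi_lock);
	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);	/* 4. publish last */
	spin_unlock_bh(&bdi_lock);
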
@@ -728,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
+static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
@@ -735,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
        wait_queue_head_t *wqh = &congestion_wqh[sync];
 
        bit = sync ? BDI_sync_congested : BDI_async_congested;
-       clear_bit(bit, &bdi->state);
+       if (test_and_clear_bit(bit, &bdi->state))
+               atomic_dec(&nr_bdi_congested[sync]);
        smp_mb__after_clear_bit();
        if (waitqueue_active(wqh))
                wake_up(wqh);
@@ -747,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
        enum bdi_state bit;
 
        bit = sync ? BDI_sync_congested : BDI_async_congested;
-       set_bit(bit, &bdi->state);
+       if (!test_and_set_bit(bit, &bdi->state))
+               atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
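The new nr_bdi_congested counter stays exactly in step with the per-bdi
state bit because only the caller that actually flips the bit touches the
counter; concurrent or repeated set/clear calls therefore remain balanced.
The generic pattern, in sketch form:

	static atomic_t nr_congested;

	/* only the 0 -> 1 transition increments */
	if (!test_and_set_bit(bit, &state))
		atomic_inc(&nr_congested);

	/* only the 1 -> 0 transition decrements */
	if (test_and_clear_bit(bit, &state))
		atomic_dec(&nr_congested);
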
@@ -763,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested);
 long congestion_wait(int sync, long timeout)
 {
        long ret;
+       unsigned long start = jiffies;
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = &congestion_wqh[sync];
 
        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        ret = io_schedule_timeout(timeout);
        finish_wait(wqh, &wait);
+
+       trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+                                       jiffies_to_usecs(jiffies - start));
+
        return ret;
 }
 EXPORT_SYMBOL(congestion_wait);
 
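The trace point added to congestion_wait() records both the requested
timeout and the time actually spent waiting, in microseconds. Its
definition is assumed to look something like:

	TRACE_EVENT(writeback_congestion_wait,
		TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
		TP_ARGS(usec_timeout, usec_delayed),
		TP_STRUCT__entry(
			__field(unsigned int, usec_timeout)
			__field(unsigned int, usec_delayed)
		),
		TP_fast_assign(
			__entry->usec_timeout	= usec_timeout;
			__entry->usec_delayed	= usec_delayed;
		),
		TP_printk("usec_timeout=%u usec_delayed=%u",
			__entry->usec_timeout, __entry->usec_delayed)
	);
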
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * If any backing_dev is congested and the given @zone has experienced
+ * recent congestion, this waits for up to @timeout jiffies for either
+ * a BDI to exit congestion of the given @sync queue or for a write to
+ * complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary, but the function does not otherwise sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+       long ret;
+       unsigned long start = jiffies;
+       DEFINE_WAIT(wait);
+       wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+       /*
+        * If there is no congestion, or heavy congestion is not being
+        * encountered in the current zone, yield if necessary instead
+        * of sleeping on the congestion queue
+        */
+       if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+                       !zone_is_reclaim_congested(zone)) {
+               cond_resched();
+
+               /* In case we scheduled, work out time remaining */
+               ret = timeout - (jiffies - start);
+               if (ret < 0)
+                       ret = 0;
+
+               goto out;
+       }
+
+       /* Sleep until uncongested or a write happens */
+       prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+       ret = io_schedule_timeout(timeout);
+       finish_wait(wqh, &wait);
+
+out:
+       trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+                                       jiffies_to_usecs(jiffies - start));
+
+       return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
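
Elsewhere in this series, direct reclaim is expected to call this in place
of congestion_wait() so that it sleeps only when a congested BDI and a
recently congested zone coincide; a hypothetical call site:

	/* vmscan-style throttling: returns almost immediately
	 * when nothing is congested */
	wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);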