Merge commit 'v2.6.39' into for-2.6.40/core
Jens Axboe [Fri, 20 May 2011 18:33:15 +0000 (20:33 +0200)]
Since for-2.6.40/core was forked off the 2.6.39 devel tree, we've
had churn in the core area that makes it difficult to handle
patches for eg cfq or blk-throttle. Instead of requiring that they
be based in older versions with bugs that have been fixed later
in the rc cycle, merge in 2.6.39 final.

Also fixes up conflicts in the below files.

Conflicts:
drivers/block/paride/pcd.c
drivers/cdrom/viocd.c
drivers/ide/ide-cd.c

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>

17 files changed:
Documentation/ABI/testing/sysfs-block
block/blk-exec.c
block/blk-flush.c
block/blk-lib.c
block/blk-settings.c
block/blk-sysfs.c
block/blk.h
block/elevator.c
drivers/ata/libata-scsi.c
drivers/block/paride/pcd.c
drivers/cdrom/viocd.c
drivers/ide/ide-cd.c
drivers/scsi/sr.c
fs/block_dev.c
fs/partitions/check.c
include/linux/blkdev.h
include/linux/genhd.h

index 4873c75..c1eb41c 100644 (file)
@@ -142,3 +142,67 @@ Description:
                with the previous I/O request are enabled. When set to 2,
                all merge tries are disabled. The default value is 0 -
                which enables all types of merge tries.
+
+What:          /sys/block/<disk>/discard_alignment
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may
+               internally allocate space in units that are bigger than
+               the exported logical block size. The discard_alignment
+               parameter indicates how many bytes the beginning of the
+               device is offset from the internal allocation unit's
+               natural alignment.
+
+What:          /sys/block/<disk>/<partition>/discard_alignment
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may
+               internally allocate space in units that are bigger than
+               the exported logical block size. The discard_alignment
+               parameter indicates how many bytes the beginning of the
+               partition is offset from the internal allocation unit's
+               natural alignment.
+
+What:          /sys/block/<disk>/queue/discard_granularity
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may
+               internally allocate space using units that are bigger
+               than the logical block size. The discard_granularity
+               parameter indicates the size of the internal allocation
+               unit in bytes if reported by the device. Otherwise the
+               discard_granularity will be set to match the device's
+               physical block size. A discard_granularity of 0 means
+               that the device does not support discard functionality.
+
+What:          /sys/block/<disk>/queue/discard_max_bytes
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may have
+               internal limits on the number of bytes that can be
+               trimmed or unmapped in a single operation. Some storage
+               protocols also have inherent limits on the number of
+               blocks that can be described in a single command. The
+               discard_max_bytes parameter is set by the device driver
+               to the maximum number of bytes that can be discarded in
+               a single operation. Discard requests issued to the
+               device must not exceed this limit. A discard_max_bytes
+               value of 0 means that the device does not support
+               discard functionality.
+
+What:          /sys/block/<disk>/queue/discard_zeroes_data
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may return
+               stale or random data when a previously discarded block
+               is read back. This can cause problems if the filesystem
+               expects discarded blocks to be explicitly cleared. If a
+               device reports that it deterministically returns zeroes
+               when a discarded area is read the discard_zeroes_data
+               parameter will be set to one. Otherwise it will be 0 and
+               the result of reading a discarded area is undefined.
index 81e3181..8a0e7ec 100644 (file)
@@ -56,7 +56,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
        spin_lock_irq(q->queue_lock);
        __elv_add_request(q, rq, where);
        __blk_run_queue(q);
-       /* the queue is stopped so it won't be plugged+unplugged */
+       /* the queue is stopped so it won't be run */
        if (rq->cmd_type == REQ_TYPE_PM_RESUME)
                q->request_fn(q);
        spin_unlock_irq(q->queue_lock);
index 6c9b5e1..bb21e4c 100644 (file)
@@ -212,13 +212,19 @@ static void flush_end_io(struct request *flush_rq, int error)
        }
 
        /*
-        * Moving a request silently to empty queue_head may stall the
-        * queue.  Kick the queue in those cases.  This function is called
-        * from request completion path and calling directly into
-        * request_fn may confuse the driver.  Always use kblockd.
+        * Kick the queue to avoid stall for two cases:
+        * 1. Moving a request silently to empty queue_head may stall the
+        * queue.
+        * 2. When flush request is running in non-queueable queue, the
+        * queue is hold. Restart the queue after flush request is finished
+        * to avoid stall.
+        * This function is called from request completion path and calling
+        * directly into request_fn may confuse the driver.  Always use
+        * kblockd.
         */
-       if (queued)
+       if (queued || q->flush_queue_delayed)
                blk_run_queue_async(q);
+       q->flush_queue_delayed = 0;
 }
 
 /**
index 25de73e..78e627e 100644 (file)
@@ -9,17 +9,20 @@
 
 #include "blk.h"
 
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
-       if (err) {
-               if (err == -EOPNOTSUPP)
-                       set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-               clear_bit(BIO_UPTODATE, &bio->bi_flags);
-       }
+struct bio_batch {
+       atomic_t                done;
+       unsigned long           flags;
+       struct completion       *wait;
+};
 
-       if (bio->bi_private)
-               complete(bio->bi_private);
+static void bio_batch_end_io(struct bio *bio, int err)
+{
+       struct bio_batch *bb = bio->bi_private;
 
+       if (err && (err != -EOPNOTSUPP))
+               clear_bit(BIO_UPTODATE, &bb->flags);
+       if (atomic_dec_and_test(&bb->done))
+               complete(bb->wait);
        bio_put(bio);
 }
 
@@ -41,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
        struct request_queue *q = bdev_get_queue(bdev);
        int type = REQ_WRITE | REQ_DISCARD;
        unsigned int max_discard_sectors;
+       struct bio_batch bb;
        struct bio *bio;
        int ret = 0;
 
@@ -67,7 +71,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                type |= REQ_SECURE;
        }
 
-       while (nr_sects && !ret) {
+       atomic_set(&bb.done, 1);
+       bb.flags = 1 << BIO_UPTODATE;
+       bb.wait = &wait;
+
+       while (nr_sects) {
                bio = bio_alloc(gfp_mask, 1);
                if (!bio) {
                        ret = -ENOMEM;
@@ -75,9 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                }
 
                bio->bi_sector = sector;
-               bio->bi_end_io = blkdev_discard_end_io;
+               bio->bi_end_io = bio_batch_end_io;
                bio->bi_bdev = bdev;
-               bio->bi_private = &wait;
+               bio->bi_private = &bb;
 
                if (nr_sects > max_discard_sectors) {
                        bio->bi_size = max_discard_sectors << 9;
@@ -88,45 +96,21 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                        nr_sects = 0;
                }
 
-               bio_get(bio);
+               atomic_inc(&bb.done);
                submit_bio(type, bio);
+       }
 
+       /* Wait for bios in-flight */
+       if (!atomic_dec_and_test(&bb.done))
                wait_for_completion(&wait);
 
-               if (bio_flagged(bio, BIO_EOPNOTSUPP))
-                       ret = -EOPNOTSUPP;
-               else if (!bio_flagged(bio, BIO_UPTODATE))
-                       ret = -EIO;
-               bio_put(bio);
-       }
+       if (!test_bit(BIO_UPTODATE, &bb.flags))
+               ret = -EIO;
 
        return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
-struct bio_batch
-{
-       atomic_t                done;
-       unsigned long           flags;
-       struct completion       *wait;
-};
-
-static void bio_batch_end_io(struct bio *bio, int err)
-{
-       struct bio_batch *bb = bio->bi_private;
-
-       if (err) {
-               if (err == -EOPNOTSUPP)
-                       set_bit(BIO_EOPNOTSUPP, &bb->flags);
-               else
-                       clear_bit(BIO_UPTODATE, &bb->flags);
-       }
-       if (bb)
-               if (atomic_dec_and_test(&bb->done))
-                       complete(bb->wait);
-       bio_put(bio);
-}
-
 /**
  * blkdev_issue_zeroout - generate number of zero filed write bios
  * @bdev:      blockdev to issue
@@ -151,7 +135,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
        bb.flags = 1 << BIO_UPTODATE;
        bb.wait = &wait;
 
-submit:
        ret = 0;
        while (nr_sects != 0) {
                bio = bio_alloc(gfp_mask,
@@ -168,9 +151,6 @@ submit:
 
                while (nr_sects != 0) {
                        sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
-                       if (sz == 0)
-                               /* bio has maximum size possible */
-                               break;
                        ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
                        nr_sects -= ret >> 9;
                        sector += ret >> 9;
@@ -190,16 +170,6 @@ submit:
                /* One of bios in the batch was completed with error.*/
                ret = -EIO;
 
-       if (ret)
-               goto out;
-
-       if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
-               ret = -EOPNOTSUPP;
-               goto out;
-       }
-       if (nr_sects != 0)
-               goto submit;
-out:
        return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
index 1fa7692..fa1eb04 100644 (file)
@@ -120,7 +120,7 @@ void blk_set_default_limits(struct queue_limits *lim)
        lim->discard_granularity = 0;
        lim->discard_alignment = 0;
        lim->discard_misaligned = 0;
-       lim->discard_zeroes_data = -1;
+       lim->discard_zeroes_data = 1;
        lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
        lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
        lim->alignment_offset = 0;
@@ -166,6 +166,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 
        blk_set_default_limits(&q->limits);
        blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
+       q->limits.discard_zeroes_data = 0;
 
        /*
         * by default assume old behaviour and bounce for any highmem page
@@ -790,6 +791,12 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush)
 }
 EXPORT_SYMBOL_GPL(blk_queue_flush);
 
+void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
+{
+       q->flush_not_queueable = !queueable;
+}
+EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
+
 static int __init blk_settings_init(void)
 {
        blk_max_low_pfn = max_low_pfn - 1;
index bd23631..d935bd8 100644 (file)
@@ -152,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
 
 static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
 {
-       return queue_var_show(q->limits.max_discard_sectors << 9, page);
+       return sprintf(page, "%llu\n",
+                      (unsigned long long)q->limits.max_discard_sectors << 9);
 }
 
 static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
index 6126346..1f798b5 100644 (file)
@@ -61,7 +61,26 @@ static inline struct request *__elv_next_request(struct request_queue *q)
                        rq = list_entry_rq(q->queue_head.next);
                        return rq;
                }
-
+               /*
+                * Flush request is running and flush request isn't queueable
+                * in the drive, we can hold the queue till flush request is
+                * finished. Even we don't do this, driver can't dispatch next
+                * requests and will requeue them. And this can improve
+                * throughput too. For example, we have request flush1, write1,
+                * flush 2. flush1 is dispatched, then queue is hold, write1
+                * isn't inserted to queue. After flush1 is finished, flush2
+                * will be dispatched. Since disk cache is already clean,
+                * flush2 will be finished very soon, so looks like flush2 is
+                * folded to flush1.
+                * Since the queue is hold, a flag is set to indicate the queue
+                * should be restarted later. Please see flush_end_io() for
+                * details.
+                */
+               if (q->flush_pending_idx != q->flush_running_idx &&
+                               !queue_flush_queueable(q)) {
+                       q->flush_queue_delayed = 1;
+                       return NULL;
+               }
                if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
                        return NULL;
        }
index 45ca1e3..2a0b653 100644 (file)
@@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name)
 
        e = elevator_find(name);
        if (!e) {
-               char elv[ELV_NAME_MAX + strlen("-iosched")];
-
                spin_unlock(&elv_list_lock);
-
-               snprintf(elv, sizeof(elv), "%s-iosched", name);
-
-               request_module("%s", elv);
+               request_module("%s-iosched", name);
                spin_lock(&elv_list_lock);
                e = elevator_find(name);
        }
index e2f57e9..dad2c95 100644 (file)
@@ -1089,21 +1089,21 @@ static int atapi_drain_needed(struct request *rq)
 static int ata_scsi_dev_config(struct scsi_device *sdev,
                               struct ata_device *dev)
 {
+       struct request_queue *q = sdev->request_queue;
+
        if (!ata_id_has_unload(dev->id))
                dev->flags |= ATA_DFLAG_NO_UNLOAD;
 
        /* configure max sectors */
-       blk_queue_max_hw_sectors(sdev->request_queue, dev->max_sectors);
+       blk_queue_max_hw_sectors(q, dev->max_sectors);
 
        if (dev->class == ATA_DEV_ATAPI) {
-               struct request_queue *q = sdev->request_queue;
                void *buf;
 
                sdev->sector_size = ATA_SECT_SIZE;
 
                /* set DMA padding */
-               blk_queue_update_dma_pad(sdev->request_queue,
-                                        ATA_DMA_PAD_SZ - 1);
+               blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1);
 
                /* configure draining */
                buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL);
@@ -1131,8 +1131,7 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
                        "sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
                        sdev->sector_size);
 
-       blk_queue_update_dma_alignment(sdev->request_queue,
-                                      sdev->sector_size - 1);
+       blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
 
        if (dev->flags & ATA_DFLAG_AN)
                set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events);
@@ -1145,6 +1144,8 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
                scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth);
        }
 
+       blk_queue_flush_queueable(q, false);
+
        dev->sdev = sdev;
        return 0;
 }
index 8690e31..a0aabd9 100644 (file)
@@ -320,6 +320,8 @@ static void pcd_init_units(void)
                disk->first_minor = unit;
                strcpy(disk->disk_name, cd->name);      /* umm... */
                disk->fops = &pcd_bdops;
+               disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
+               disk->events = DISK_EVENT_MEDIA_CHANGE;
        }
 }
 
index e427fbe..ae15a4d 100644 (file)
@@ -625,7 +625,9 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
        blk_queue_max_hw_sectors(q, 4096 / 512);
        gendisk->queue = q;
        gendisk->fops = &viocd_fops;
-       gendisk->flags = GENHD_FL_CD|GENHD_FL_REMOVABLE;
+       gendisk->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE |
+                        GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
+       gendisk->events = DISK_EVENT_MEDIA_CHANGE;
        set_capacity(gendisk, 0);
        gendisk->private_data = d;
        d->viocd_disk = gendisk;
index a5ec5a7..6e5123b 100644 (file)
@@ -1781,7 +1781,8 @@ static int ide_cd_probe(ide_drive_t *drive)
 
        ide_cd_read_toc(drive, &sense);
        g->fops = &idecd_ops;
-       g->flags |= GENHD_FL_REMOVABLE;
+       g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
+       g->events = DISK_EVENT_MEDIA_CHANGE;
        add_disk(g);
        return 0;
 
index 95019c7..4778e27 100644 (file)
@@ -636,7 +636,7 @@ static int sr_probe(struct device *dev)
        disk->first_minor = minor;
        sprintf(disk->disk_name, "sr%d", minor);
        disk->fops = &sr_bdops;
-       disk->flags = GENHD_FL_CD;
+       disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
        disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
 
        blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT);
index 257b00e..d7c2e0f 100644 (file)
@@ -1237,6 +1237,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
        res = __blkdev_get(bdev, mode, 0);
 
        if (whole) {
+               struct gendisk *disk = whole->bd_disk;
+
                /* finish claiming */
                mutex_lock(&bdev->bd_mutex);
                spin_lock(&bdev_lock);
@@ -1263,15 +1265,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
                spin_unlock(&bdev_lock);
 
                /*
-                * Block event polling for write claims.  Any write
-                * holder makes the write_holder state stick until all
-                * are released.  This is good enough and tracking
-                * individual writeable reference is too fragile given
-                * the way @mode is used in blkdev_get/put().
+                * Block event polling for write claims if requested.  Any
+                * write holder makes the write_holder state stick until
+                * all are released.  This is good enough and tracking
+                * individual writeable reference is too fragile given the
+                * way @mode is used in blkdev_get/put().
                 */
-               if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+               if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
+                   !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
                        bdev->bd_write_holder = true;
-                       disk_block_events(bdev->bd_disk);
+                       disk_block_events(disk);
                }
 
                mutex_unlock(&bdev->bd_mutex);
index d545e97..8ed4d34 100644 (file)
@@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
 {
        struct hd_struct *p = dev_to_part(dev);
-       return sprintf(buf, "%u\n", p->discard_alignment);
+       struct gendisk *disk = dev_to_disk(dev);
+
+       return sprintf(buf, "%u\n",
+                       queue_limit_discard_alignment(&disk->queue->limits,
+                                                       p->start_sect));
 }
 
 ssize_t part_stat_show(struct device *dev,
@@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        p->start_sect = start;
        p->alignment_offset =
                queue_limit_alignment_offset(&disk->queue->limits, start);
-       p->discard_alignment =
-               queue_limit_discard_alignment(&disk->queue->limits, start);
        p->nr_sects = len;
        p->partno = partno;
        p->policy = get_disk_ro(disk);
index 2ad95fa..ae9091a 100644 (file)
@@ -257,7 +257,7 @@ struct queue_limits {
        unsigned char           misaligned;
        unsigned char           discard_misaligned;
        unsigned char           cluster;
-       signed char             discard_zeroes_data;
+       unsigned char           discard_zeroes_data;
 };
 
 struct request_queue
@@ -364,6 +364,8 @@ struct request_queue
         * for flush operations
         */
        unsigned int            flush_flags;
+       unsigned int            flush_not_queueable:1;
+       unsigned int            flush_queue_delayed:1;
        unsigned int            flush_pending_idx:1;
        unsigned int            flush_running_idx:1;
        unsigned long           flush_pending_since;
@@ -843,6 +845,7 @@ extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
+extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
@@ -1066,13 +1069,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
 {
        unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1);
 
+       if (!lim->max_discard_sectors)
+               return 0;
+
        return (lim->discard_granularity + lim->discard_alignment - alignment)
                & (lim->discard_granularity - 1);
 }
 
 static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
 {
-       if (q->limits.discard_zeroes_data == 1)
+       if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
                return 1;
 
        return 0;
@@ -1111,6 +1117,11 @@ static inline unsigned int block_size(struct block_device *bdev)
        return bdev->bd_block_size;
 }
 
+static inline bool queue_flush_queueable(struct request_queue *q)
+{
+       return !q->flush_not_queueable;
+}
+
 typedef struct {struct page *v;} Sector;
 
 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
index d764a42..b78956b 100644 (file)
@@ -100,7 +100,6 @@ struct hd_struct {
        sector_t start_sect;
        sector_t nr_sects;
        sector_t alignment_offset;
-       unsigned int discard_alignment;
        struct device __dev;
        struct kobject *holder_dir;
        int policy, partno;
@@ -127,6 +126,7 @@ struct hd_struct {
 #define GENHD_FL_SUPPRESS_PARTITION_INFO       32
 #define GENHD_FL_EXT_DEVT                      64 /* allow extended devt */
 #define GENHD_FL_NATIVE_CAPACITY               128
+#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE    256
 
 enum {
        DISK_EVENT_MEDIA_CHANGE                 = 1 << 0, /* media changed */