xen-blkback: Implement discard requests ('feature-discard')
Li Dongyang [Thu, 1 Sep 2011 10:39:10 +0000 (18:39 +0800)]
..aka ATA TRIM/SCSI UNMAP command to be passed through the frontend
and used appropriately by the backend. We also advertise
certain granularity parameters to the frontend so it can plug them in.
If the backend is a real device - we just end up using
'blkdev_issue_discard' while for loopback devices - we just punch
a hole in the image file.

Signed-off-by: Li Dongyang <lidongyang@novell.com>
[v1: Fixed up pr_debug and commit description]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

drivers/block/xen-blkback/blkback.c
drivers/block/xen-blkback/common.h
drivers/block/xen-blkback/xenbus.c

index 2330a9a..9713d5a 100644 (file)
@@ -39,6 +39,9 @@
 #include <linux/list.h>
 #include <linux/delay.h>
 #include <linux/freezer.h>
+#include <linux/loop.h>
+#include <linux/falloc.h>
+#include <linux/fs.h>
 
 #include <xen/events.h>
 #include <xen/page.h>
@@ -258,13 +261,16 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
 
 static void print_stats(struct xen_blkif *blkif)
 {
-       pr_info("xen-blkback (%s): oo %3d  |  rd %4d  |  wr %4d  |  f %4d\n",
+       pr_info("xen-blkback (%s): oo %3d  |  rd %4d  |  wr %4d  |  f %4d"
+                "  |  ds %4d\n",
                 current->comm, blkif->st_oo_req,
-                blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req);
+                blkif->st_rd_req, blkif->st_wr_req,
+                blkif->st_f_req, blkif->st_ds_req);
        blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        blkif->st_rd_req = 0;
        blkif->st_wr_req = 0;
        blkif->st_oo_req = 0;
+       blkif->st_ds_req = 0;
 }
 
 int xen_blkif_schedule(void *arg)
@@ -410,6 +416,42 @@ static int xen_blkbk_map(struct blkif_request *req,
        return ret;
 }
 
+static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req)
+{
+       int err = 0;
+       int status = BLKIF_RSP_OKAY;
+       struct block_device *bdev = blkif->vbd.bdev;
+
+       if (blkif->blk_backend_type == BLKIF_BACKEND_PHY)
+               /* just forward the discard request */
+               err = blkdev_issue_discard(bdev,
+                               req->u.discard.sector_number,
+                               req->u.discard.nr_sectors,
+                               GFP_KERNEL, 0);
+       else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) {
+               /* punch a hole in the backing file */
+               struct loop_device *lo = bdev->bd_disk->private_data;
+               struct file *file = lo->lo_backing_file;
+
+               if (file->f_op->fallocate)
+                       err = file->f_op->fallocate(file,
+                               FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+                               req->u.discard.sector_number << 9,
+                               req->u.discard.nr_sectors << 9);
+               else
+                       err = -EOPNOTSUPP;
+       } else
+               err = -EOPNOTSUPP;
+
+       if (err == -EOPNOTSUPP) {
+               pr_debug(DRV_PFX "discard op failed, not supported\n");
+               status = BLKIF_RSP_EOPNOTSUPP;
+       } else if (err)
+               status = BLKIF_RSP_ERROR;
+
+       make_response(blkif, req->id, req->operation, status);
+}
+
 /*
  * Completion callback on the bio's. Called as bh->b_end_io()
  */
@@ -563,6 +605,10 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
                blkif->st_f_req++;
                operation = WRITE_FLUSH;
                break;
+       case BLKIF_OP_DISCARD:
+               blkif->st_ds_req++;
+               operation = REQ_DISCARD;
+               break;
        case BLKIF_OP_WRITE_BARRIER:
        default:
                operation = 0; /* make gcc happy */
@@ -572,7 +618,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 
        /* Check that the number of segments is sane. */
        nseg = req->nr_segments;
-       if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
+       if (unlikely(nseg == 0 && operation != WRITE_FLUSH &&
+                               operation != REQ_DISCARD) ||
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
                         nseg);
@@ -627,10 +674,14 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
         * the hypercall to unmap the grants - that is all done in
         * xen_blkbk_unmap.
         */
-       if (xen_blkbk_map(req, pending_req, seg))
+       if (operation != BLKIF_OP_DISCARD &&
+                       xen_blkbk_map(req, pending_req, seg))
                goto fail_flush;
 
-       /* This corresponding xen_blkif_put is done in __end_block_io_op */
+       /*
+        * This corresponding xen_blkif_put is done in __end_block_io_op, or
+        * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
+        */
        xen_blkif_get(blkif);
 
        for (i = 0; i < nseg; i++) {
@@ -654,18 +705,25 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
                preq.sector_number += seg[i].nsec;
        }
 
-       /* This will be hit if the operation was a flush. */
+       /* This will be hit if the operation was a flush or discard. */
        if (!bio) {
-               BUG_ON(operation != WRITE_FLUSH);
+               BUG_ON(operation != WRITE_FLUSH && operation != REQ_DISCARD);
 
-               bio = bio_alloc(GFP_KERNEL, 0);
-               if (unlikely(bio == NULL))
-                       goto fail_put_bio;
+               if (operation == WRITE_FLUSH) {
+                       bio = bio_alloc(GFP_KERNEL, 0);
+                       if (unlikely(bio == NULL))
+                               goto fail_put_bio;
 
-               biolist[nbio++] = bio;
-               bio->bi_bdev    = preq.bdev;
-               bio->bi_private = pending_req;
-               bio->bi_end_io  = end_block_io_op;
+                       biolist[nbio++] = bio;
+                       bio->bi_bdev    = preq.bdev;
+                       bio->bi_private = pending_req;
+                       bio->bi_end_io  = end_block_io_op;
+               } else if (operation == REQ_DISCARD) {
+                       xen_blk_discard(blkif, req);
+                       xen_blkif_put(blkif);
+                       free_req(pending_req);
+                       return 0;
+               }
        }
 
        /*
index 9e40b28..bfb532e 100644 (file)
@@ -63,13 +63,26 @@ struct blkif_common_response {
 
 /* i386 protocol version */
 #pragma pack(push, 4)
+
+struct blkif_x86_32_request_rw {
+       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+       struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+
+struct blkif_x86_32_request_discard {
+       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+       uint64_t nr_sectors;
+};
+
 struct blkif_x86_32_request {
        uint8_t        operation;    /* BLKIF_OP_???                         */
        uint8_t        nr_segments;  /* number of segments                   */
        blkif_vdev_t   handle;       /* only for read/write requests         */
        uint64_t       id;           /* private guest value, echoed in resp  */
-       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-       struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       union {
+               struct blkif_x86_32_request_rw rw;
+               struct blkif_x86_32_request_discard discard;
+       } u;
 };
 struct blkif_x86_32_response {
        uint64_t        id;              /* copied from request */
@@ -79,13 +92,26 @@ struct blkif_x86_32_response {
 #pragma pack(pop)
 
 /* x86_64 protocol version */
+
+struct blkif_x86_64_request_rw {
+       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+       struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+
+struct blkif_x86_64_request_discard {
+       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+       uint64_t nr_sectors;
+};
+
 struct blkif_x86_64_request {
        uint8_t        operation;    /* BLKIF_OP_???                         */
        uint8_t        nr_segments;  /* number of segments                   */
        blkif_vdev_t   handle;       /* only for read/write requests         */
        uint64_t       __attribute__((__aligned__(8))) id;
-       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-       struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       union {
+               struct blkif_x86_64_request_rw rw;
+               struct blkif_x86_64_request_discard discard;
+       } u;
 };
 struct blkif_x86_64_response {
        uint64_t       __attribute__((__aligned__(8))) id;
@@ -113,6 +139,11 @@ enum blkif_protocol {
        BLKIF_PROTOCOL_X86_64 = 3,
 };
 
+enum blkif_backend_type {
+       BLKIF_BACKEND_PHY  = 1,
+       BLKIF_BACKEND_FILE = 2,
+};
+
 struct xen_vbd {
        /* What the domain refers to this vbd as. */
        blkif_vdev_t            handle;
@@ -138,6 +169,7 @@ struct xen_blkif {
        unsigned int            irq;
        /* Comms information. */
        enum blkif_protocol     blk_protocol;
+       enum blkif_backend_type blk_backend_type;
        union blkif_back_rings  blk_rings;
        struct vm_struct        *blk_ring_area;
        /* The VBD attached to this interface. */
@@ -159,6 +191,7 @@ struct xen_blkif {
        int                     st_wr_req;
        int                     st_oo_req;
        int                     st_f_req;
+       int                     st_ds_req;
        int                     st_rd_sect;
        int                     st_wr_sect;
 
@@ -182,7 +215,7 @@ struct xen_blkif {
 
 struct phys_req {
        unsigned short          dev;
-       unsigned short          nr_sects;
+       blkif_sector_t          nr_sects;
        struct block_device     *bdev;
        blkif_sector_t          sector_number;
 };
@@ -206,12 +239,25 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
        dst->nr_segments = src->nr_segments;
        dst->handle = src->handle;
        dst->id = src->id;
-       dst->u.rw.sector_number = src->sector_number;
-       barrier();
-       if (n > dst->nr_segments)
-               n = dst->nr_segments;
-       for (i = 0; i < n; i++)
-               dst->u.rw.seg[i] = src->seg[i];
+       switch (src->operation) {
+       case BLKIF_OP_READ:
+       case BLKIF_OP_WRITE:
+       case BLKIF_OP_WRITE_BARRIER:
+       case BLKIF_OP_FLUSH_DISKCACHE:
+               dst->u.rw.sector_number = src->u.rw.sector_number;
+               barrier();
+               if (n > dst->nr_segments)
+                       n = dst->nr_segments;
+               for (i = 0; i < n; i++)
+                       dst->u.rw.seg[i] = src->u.rw.seg[i];
+               break;
+       case BLKIF_OP_DISCARD:
+               dst->u.discard.sector_number = src->u.discard.sector_number;
+               dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+               break;
+       default:
+               break;
+       }
 }
 
 static inline void blkif_get_x86_64_req(struct blkif_request *dst,
@@ -222,12 +268,25 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
        dst->nr_segments = src->nr_segments;
        dst->handle = src->handle;
        dst->id = src->id;
-       dst->u.rw.sector_number = src->sector_number;
-       barrier();
-       if (n > dst->nr_segments)
-               n = dst->nr_segments;
-       for (i = 0; i < n; i++)
-               dst->u.rw.seg[i] = src->seg[i];
+       switch (src->operation) {
+       case BLKIF_OP_READ:
+       case BLKIF_OP_WRITE:
+       case BLKIF_OP_WRITE_BARRIER:
+       case BLKIF_OP_FLUSH_DISKCACHE:
+               dst->u.rw.sector_number = src->u.rw.sector_number;
+               barrier();
+               if (n > dst->nr_segments)
+                       n = dst->nr_segments;
+               for (i = 0; i < n; i++)
+                       dst->u.rw.seg[i] = src->u.rw.seg[i];
+               break;
+       case BLKIF_OP_DISCARD:
+               dst->u.discard.sector_number = src->u.discard.sector_number;
+               dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+               break;
+       default:
+               break;
+       }
 }
 
 #endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
index 3f129b4..2b3aef0 100644 (file)
@@ -272,6 +272,7 @@ VBD_SHOW(oo_req,  "%d\n", be->blkif->st_oo_req);
 VBD_SHOW(rd_req,  "%d\n", be->blkif->st_rd_req);
 VBD_SHOW(wr_req,  "%d\n", be->blkif->st_wr_req);
 VBD_SHOW(f_req,  "%d\n", be->blkif->st_f_req);
+VBD_SHOW(ds_req,  "%d\n", be->blkif->st_ds_req);
 VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
 VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
 
@@ -280,6 +281,7 @@ static struct attribute *xen_vbdstat_attrs[] = {
        &dev_attr_rd_req.attr,
        &dev_attr_wr_req.attr,
        &dev_attr_f_req.attr,
+       &dev_attr_ds_req.attr,
        &dev_attr_rd_sect.attr,
        &dev_attr_wr_sect.attr,
        NULL
@@ -419,6 +421,60 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
        return err;
 }
 
+int xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       struct xen_blkif *blkif = be->blkif;
+       char *type;
+       int err;
+       int state = 0;
+
+       type = xenbus_read(XBT_NIL, dev->nodename, "type", NULL);
+       if (!IS_ERR(type)) {
+               if (strncmp(type, "file", 4) == 0) {
+                       state = 1;
+                       blkif->blk_backend_type = BLKIF_BACKEND_FILE;
+               }
+               if (strncmp(type, "phy", 3) == 0) {
+                       struct block_device *bdev = be->blkif->vbd.bdev;
+                       struct request_queue *q = bdev_get_queue(bdev);
+                       if (blk_queue_discard(q)) {
+                               err = xenbus_printf(xbt, dev->nodename,
+                                       "discard-granularity", "%u",
+                                       q->limits.discard_granularity);
+                               if (err) {
+                                       xenbus_dev_fatal(dev, err,
+                                               "writing discard-granularity");
+                                       goto kfree;
+                               }
+                               err = xenbus_printf(xbt, dev->nodename,
+                                       "discard-alignment", "%u",
+                                       q->limits.discard_alignment);
+                               if (err) {
+                                       xenbus_dev_fatal(dev, err,
+                                               "writing discard-alignment");
+                                       goto kfree;
+                               }
+                               state = 1;
+                               blkif->blk_backend_type = BLKIF_BACKEND_PHY;
+                       }
+               }
+       } else {
+               err = PTR_ERR(type);
+               xenbus_dev_fatal(dev, err, "reading type");
+               goto out;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-discard",
+                           "%d", state);
+       if (err)
+               xenbus_dev_fatal(dev, err, "writing feature-discard");
+kfree:
+       kfree(type);
+out:
+       return err;
+}
+
 /*
  * Entry point to this code when a new device is created.  Allocate the basic
  * structures, and watch the store waiting for the hotplug scripts to tell us
@@ -650,6 +706,8 @@ again:
        if (err)
                goto abort;
 
+       err = xen_blkbk_discard(xbt, be);
+
        err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
                            (unsigned long long)vbd_sz(&be->blkif->vbd));
        if (err) {